source: trunk/src/corelib/tools/qregexp.cpp@ 240

Last change on this file since 240 was 2, checked in by Dmitry A. Kuminov, 16 years ago

Initially imported qt-all-opensource-src-4.5.1 from Trolltech.

File size: 125.9 KB
Line 
1/****************************************************************************
2**
3** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
4** Contact: Qt Software Information ([email protected])
5**
6** This file is part of the QtCore module of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:LGPL$
9** Commercial Usage
10** Licensees holding valid Qt Commercial licenses may use this file in
11** accordance with the Qt Commercial License Agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and Nokia.
14**
15** GNU Lesser General Public License Usage
16** Alternatively, this file may be used under the terms of the GNU Lesser
17** General Public License version 2.1 as published by the Free Software
18** Foundation and appearing in the file LICENSE.LGPL included in the
19** packaging of this file. Please review the following information to
20** ensure the GNU Lesser General Public License version 2.1 requirements
21** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
22**
23** In addition, as a special exception, Nokia gives you certain
24** additional rights. These rights are described in the Nokia Qt LGPL
25** Exception version 1.0, included in the file LGPL_EXCEPTION.txt in this
26** package.
27**
28** GNU General Public License Usage
29** Alternatively, this file may be used under the terms of the GNU
30** General Public License version 3.0 as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL included in the
32** packaging of this file. Please review the following information to
33** ensure the GNU General Public License version 3.0 requirements will be
34** met: http://www.gnu.org/copyleft/gpl.html.
35**
36** If you are unsure which license is appropriate for your use, please
37** contact the sales department at [email protected].
38** $QT_END_LICENSE$
39**
40****************************************************************************/
41
42#include "qregexp.h"
43
44#include "qalgorithms.h"
45#include "qbitarray.h"
46#include "qcache.h"
47#include "qdatastream.h"
48#include "qlist.h"
49#include "qmap.h"
50#include "qmutex.h"
51#include "qstring.h"
52#include "qstringlist.h"
53#include "qstringmatcher.h"
54#include "qvector.h"
55
56#include <limits.h>
57
58QT_BEGIN_NAMESPACE
59
60int qFindString(const QChar *haystack, int haystackLen, int from,
61 const QChar *needle, int needleLen, Qt::CaseSensitivity cs);
62
63// error strings for the regexp parser
64#define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
65#define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
66#define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
67#define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
68#define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
69#define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
70#define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
71#define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
72#define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
73
74/*
75 WARNING! Be sure to read qregexp.tex before modifying this file.
76*/
77
78/*!
79 \class QRegExp
80 \reentrant
81 \brief The QRegExp class provides pattern matching using regular expressions.
82
83 \ingroup tools
84 \ingroup misc
85 \ingroup shared
86 \mainclass
87 \keyword regular expression
88
89 A regular expression, or "regexp", is a pattern for matching
90 substrings in a text. This is useful in many contexts, e.g.,
91
92 \table
93 \row \i Validation
94 \i A regexp can test whether a substring meets some criteria,
95 e.g. is an integer or contains no whitespace.
96 \row \i Searching
97 \i A regexp provides more powerful pattern matching than
98 simple substring matching, e.g., match one of the words
99 \e{mail}, \e{letter} or \e{correspondence}, but none of the
100 words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
101 \row \i Search and Replace
102 \i A regexp can replace all occurrences of a substring with a
103 different substring, e.g., replace all occurrences of \e{&}
104 with \e{\&amp;} except where the \e{&} is already followed by
105 an \e{amp;}.
106 \row \i String Splitting
107 \i A regexp can be used to identify where a string should be
108 split apart, e.g. splitting tab-delimited strings.
109 \endtable
110
111 This documentation presents a brief introduction to regexps, a
112 description of Qt's regexp language, some examples, and the function
113 documentation itself. QRegExp is modeled on Perl's regexp
114 language. It fully supports Unicode. QRegExp can also be used in a
115 simpler, \e{wildcard mode} that is similar to the functionality
116 found in command shells. The syntax rules used by QRegExp can be
117 changed with setPatternSyntax(). In particular, the pattern syntax
118 can be set to QRegExp::FixedString, which means the pattern to be
119 matched is interpreted as a plain string, i.e., special characters
120 (e.g., backslash) are not escaped.
121
122 A good text on regexps is \e {Mastering Regular Expressions}
123 (Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4.
124
125 \tableofcontents
126
127 \section1 Introduction
128
129 Regexps are built up from expressions, quantifiers, and
130 assertions. The simplest expression is a character, e.g. \bold{x}
131 or \bold{5}. An expression can also be a set of characters
132 enclosed in square brackets. \bold{[ABCD]} will match an \bold{A}
133 or a \bold{B} or a \bold{C} or a \bold{D}. We can write this same
134 expression as \bold{[A-D]}, and an expression to match any
135 capital letter in the English alphabet is written as
136 \bold{[A-Z]}.
137
138 A quantifier specifies the number of occurrences of an expression
139 that must be matched. \bold{x{1,1}} means match one and only one
140 \bold{x}. \bold{x{1,5}} means match a sequence of \bold{x}
141 characters that contains at least one \bold{x} but no more than
142 five.
143
144 Note that in general regexps cannot be used to check for balanced
145 brackets or tags. For example, a regexp can be written to match an
146 opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
147 are not nested, but if the \c{<b>} tags are nested, that same
148 regexp will match an opening \c{<b>} tag with the wrong closing
149 \c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the
150 first \c{<b>} would be matched with the first \c{</b>}, which is
151 not correct. However, it is possible to write a regexp that will
152 match nested brackets or tags correctly, but only if the number of
153 nesting levels is fixed and known. If the number of nesting levels
154 is not fixed and known, it is impossible to write a regexp that
155 will not fail.
156
157 Suppose we want a regexp to match integers in the range 0 to 99.
158 At least one digit is required, so we start with the expression
159 \bold{[0-9]{1,1}}, which matches a single digit exactly once. This
160 regexp matches integers in the range 0 to 9. To match integers up
161 to 99, increase the maximum number of occurrences to 2, so the
162 regexp becomes \bold{[0-9]{1,2}}. This regexp satisfies the
163 original requirement to match integers from 0 to 99, but it will
164 also match integers that occur in the middle of strings. If we
165 want the matched integer to be the whole string, we must use the
166 anchor assertions, \bold{^} (caret) and \bold{$} (dollar). When
167 \bold{^} is the first character in a regexp, it means the regexp
168 must match from the beginning of the string. When \bold{$} is the
169 last character of the regexp, it means the regexp must match to
170 the end of the string. The regexp becomes \bold{^[0-9]{1,2}$}.
171 Note that assertions, e.g. \bold{^} and \bold{$}, do not match
172 characters but locations in the string.
173
174 If you have seen regexps described elsewhere, they may have looked
175 different from the ones shown here. This is because some sets of
176 characters and some quantifiers are so common that they have been
177 given special symbols to represent them. \bold{[0-9]} can be
178 replaced with the symbol \bold{\\d}. The quantifier to match
179 exactly one occurrence, \bold{{1,1}}, can be replaced with the
180 expression itself, i.e. \bold{x{1,1}} is the same as \bold{x}. So
181 our 0 to 99 matcher could be written as \bold{^\\d{1,2}$}. It can
182 also be written \bold{^\\d\\d{0,1}$}, i.e. \e{From the start of
183 the string, match a digit, followed immediately by 0 or 1 digits}.
184 In practice, it would be written as \bold{^\\d\\d?$}. The \bold{?}
185 is shorthand for the quantifier \bold{{0,1}}, i.e. 0 or 1
186 occurrences. \bold{?} makes an expression optional. The regexp
187 \bold{^\\d\\d?$} means \e{From the beginning of the string, match
188 one digit, followed immediately by 0 or 1 more digit, followed
189 immediately by end of string}.
190
191 To write a regexp that matches one of the words 'mail' \e or
192 'letter' \e or 'correspondence' but does not match words that
193 contain these words, e.g., 'email', 'mailman', 'mailer', and
194 'letterbox', start with a regexp that matches 'mail'. Expressed
195 fully, the regexp is \bold{m{1,1}a{1,1}i{1,1}l{1,1}}, but because
196 a character expression is automatically quantified by
197 \bold{{1,1}}, we can simplify the regexp to \bold{mail}, i.e., an
198 'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now
199 we can use the vertical bar \bold{|}, which means \bold{or}, to
200 include the other two words, so our regexp for matching any of the
201 three words becomes \bold{mail|letter|correspondence}. Match
202 'mail' \bold{or} 'letter' \bold{or} 'correspondence'. While this
203 regexp will match one of the three words we want to match, it will
204 also match words we don't want to match, e.g., 'email'. To
205 prevent the regexp from matching unwanted words, we must tell it
206 to begin and end the match at word boundaries. First we enclose
207 our regexp in parentheses, \bold{(mail|letter|correspondence)}.
208 Parentheses group expressions together, and they identify a part
209 of the regexp that we wish to \l{capturing text}{capture}.
210 Enclosing the expression in parentheses allows us to use it as a
211 component in more complex regexps. It also allows us to examine
212 which of the three words was actually matched. To force the match
213 to begin and end on word boundaries, we enclose the regexp in
214 \bold{\\b} \e{word boundary} assertions:
215 \bold{\\b(mail|letter|correspondence)\\b}. Now the regexp means:
216 \e{Match a word boundary, followed by the regexp in parentheses,
217 followed by a word boundary}. The \bold{\\b} assertion matches a
218 \e position in the regexp, not a \e character. A word boundary is
219 any non-word character, e.g., a space, newline, or the beginning
220 or ending of a string.
221
222 If we want to replace ampersand characters with the HTML entity
223 \bold{\&amp;}, the regexp to match is simply \bold{\&}. But this
224 regexp will also match ampersands that have already been converted
225 to HTML entities. We want to replace only ampersands that are not
226 already followed by \bold{amp;}. For this, we need the negative
227 lookahead assertion, \bold{(?!}__\bold{)}. The regexp can then be
228 written as \bold{\&(?!amp;)}, i.e. \e{Match an ampersand that is}
229 \bold{not} \e{followed by} \bold{amp;}.
230
231 If we want to count all the occurrences of 'Eric' and 'Eirik' in a
232 string, two valid solutions are \bold{\\b(Eric|Eirik)\\b} and
233 \bold{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is
234 required to avoid matching words that contain either name,
235 e.g. 'Ericsson'. Note that the second regexp matches more
236 spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.
237
238 Some of the examples discussed above are implemented in the
239 \link #code-examples code examples \endlink section.
240
241 \target characters-and-abbreviations-for-sets-of-characters
242 \section1 Characters and Abbreviations for Sets of Characters
243
244 \table
245 \header \i Element \i Meaning
246 \row \i \bold{c}
247 \i A character represents itself unless it has a special
248 regexp meaning. e.g. \bold{c} matches the character \e c.
249 \row \i \bold{\\c}
250 \i A character that follows a backslash matches the character
251 itself, except as specified below. e.g., To match a literal
252 caret at the beginning of a string, write \bold{\\^}.
253 \row \i \bold{\\a}
254 \i Matches the ASCII bell (BEL, 0x07).
255 \row \i \bold{\\f}
256 \i Matches the ASCII form feed (FF, 0x0C).
257 \row \i \bold{\\n}
258 \i Matches the ASCII line feed (LF, 0x0A, Unix newline).
259 \row \i \bold{\\r}
260 \i Matches the ASCII carriage return (CR, 0x0D).
261 \row \i \bold{\\t}
262 \i Matches the ASCII horizontal tab (HT, 0x09).
263 \row \i \bold{\\v}
264 \i Matches the ASCII vertical tab (VT, 0x0B).
265 \row \i \bold{\\x\e{hhhh}}
266 \i Matches the Unicode character corresponding to the
267 hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).
268 \row \i \bold{\\0\e{ooo}} (i.e., \\zero \e{ooo})
269 \i matches the ASCII/Latin1 character for the octal number
270 \e{ooo} (between 0 and 0377).
271 \row \i \bold{. (dot)}
272 \i Matches any character (including newline).
273 \row \i \bold{\\d}
274 \i Matches a digit (QChar::isDigit()).
275 \row \i \bold{\\D}
276 \i Matches a non-digit.
277 \row \i \bold{\\s}
278 \i Matches a whitespace character (QChar::isSpace()).
279 \row \i \bold{\\S}
280 \i Matches a non-whitespace character.
281 \row \i \bold{\\w}
282 \i Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').
283 \row \i \bold{\\W}
284 \i Matches a non-word character.
285 \row \i \bold{\\\e{n}}
286 \i The \e{n}-th \l backreference, e.g. \\1, \\2, etc.
287 \endtable
288
289 \bold{Note:} The C++ compiler transforms backslashes in strings.
290 To include a \bold{\\} in a regexp, enter it twice, i.e. \c{\\}.
291 To match the backslash character itself, enter it four times, i.e.
292 \c{\\\\}.
293
294 \target sets-of-characters
295 \section1 Sets of Characters
296
297 Square brackets mean match any character contained in the square
298 brackets. The character set abbreviations described above can
299 appear in a character set in square brackets. Except for the
300 character set abbreviations and the following two exceptions,
301 characters do not have special meanings in square brackets.
302
303 \table
304 \row \i \bold{^}
305
306 \i The caret negates the character set if it occurs as the
307 first character (i.e. immediately after the opening square
308 bracket). \bold{[abc]} matches 'a' or 'b' or 'c', but
309 \bold{[^abc]} matches anything \e but 'a' or 'b' or 'c'.
310
311 \row \i \bold{-}
312
313 \i The dash indicates a range of characters. \bold{[W-Z]}
314 matches 'W' or 'X' or 'Y' or 'Z'.
315
316 \endtable
317
318 Using the predefined character set abbreviations is more portable
319 than using character ranges across platforms and languages. For
320 example, \bold{[0-9]} matches a digit in Western alphabets but
321 \bold{\\d} matches a digit in \e any alphabet.
322
323 Note: In other regexp documentation, sets of characters are often
324 called "character classes".
325
326 \target quantifiers
327 \section1 Quantifiers
328
329 By default, an expression is automatically quantified by
330 \bold{{1,1}}, i.e. it should occur exactly once. In the following
331 list, \bold{\e {E}} stands for expression. An expression is a
332 character, or an abbreviation for a set of characters, or a set of
333 characters in square brackets, or an expression in parentheses.
334
335 \table
336 \row \i \bold{\e {E}?}
337
338 \i Matches zero or one occurrences of \e E. This quantifier
339 means \e{The previous expression is optional}, because it
340 will match whether or not the expression is found. \bold{\e
341 {E}?} is the same as \bold{\e {E}{0,1}}. e.g., \bold{dents?}
342 matches 'dent' or 'dents'.
343
344 \row \i \bold{\e {E}+}
345
346 \i Matches one or more occurrences of \e E. \bold{\e {E}+} is
347 the same as \bold{\e {E}{1,}}. e.g., \bold{0+} matches '0',
348 '00', '000', etc.
349
350 \row \i \bold{\e {E}*}
351
352 \i Matches zero or more occurrences of \e E. It is the same
353 as \bold{\e {E}{0,}}. The \bold{*} quantifier is often used
354 in error where \bold{+} should be used. For example, if
355 \bold{\\s*$} is used in an expression to match strings that
356 end in whitespace, it will match every string because
357 \bold{\\s*$} means \e{Match zero or more whitespaces followed
358 by end of string}. The correct regexp to match strings that
359 have at least one trailing whitespace character is
360 \bold{\\s+$}.
361
362 \row \i \bold{\e {E}{n}}
363
364 \i Matches exactly \e n occurrences of \e E. \bold{\e {E}{n}}
365 is the same as repeating \e E \e n times. For example,
366 \bold{x{5}} is the same as \bold{xxxxx}. It is also the same
367 as \bold{\e {E}{n,n}}, e.g. \bold{x{5,5}}.
368
369 \row \i \bold{\e {E}{n,}}
370 \i Matches at least \e n occurrences of \e E.
371
372 \row \i \bold{\e {E}{,m}}
373 \i Matches at most \e m occurrences of \e E. \bold{\e {E}{,m}}
374 is the same as \bold{\e {E}{0,m}}.
375
376 \row \i \bold{\e {E}{n,m}}
377 \i Matches at least \e n and at most \e m occurrences of \e E.
378 \endtable
379
380 To apply a quantifier to more than just the preceding character,
381 use parentheses to group characters together in an expression. For
382 example, \bold{tag+} matches a 't' followed by an 'a' followed by
383 at least one 'g', whereas \bold{(tag)+} matches at least one
384 occurrence of 'tag'.
385
386 Note: Quantifiers are normally "greedy". They always match as much
387 text as they can. For example, \bold{0+} matches the first zero it
388 finds and all the consecutive zeros after the first zero. Applied
389 to '20005', it matches '2\underline{000}5'. Quantifiers can be made
390 non-greedy, see setMinimal().
391
392 \target capturing parentheses
393 \target backreferences
394 \section1 Capturing Text
395
396 Parentheses allow us to group elements together so that we can
397 quantify and capture them. For example if we have the expression
398 \bold{mail|letter|correspondence} that matches a string we know
399 that \e one of the words matched but not which one. Using
400 parentheses allows us to "capture" whatever is matched within
401 their bounds, so if we used \bold{(mail|letter|correspondence)}
402 and matched this regexp against the string "I sent you some email"
403 we can use the cap() or capturedTexts() functions to extract the
404 matched characters, in this case 'mail'.
405
406 We can use captured text within the regexp itself. To refer to the
407 captured text we use \e backreferences which are indexed from 1,
408 the same as for cap(). For example we could search for duplicate
409 words in a string using \bold{\\b(\\w+)\\W+\\1\\b} which means match a
410 word boundary followed by one or more word characters followed by
411 one or more non-word characters followed by the same text as the
412 first parenthesized expression followed by a word boundary.
413
414 If we want to use parentheses purely for grouping and not for
415 capturing we can use the non-capturing syntax, e.g.
416 \bold{(?:green|blue)}. Non-capturing parentheses begin '(?:' and
417 end ')'. In this example we match either 'green' or 'blue' but we
418 do not capture the match so we only know whether or not we matched
419 but not which color we actually found. Using non-capturing
420 parentheses is more efficient than using capturing parentheses
421 since the regexp engine has to do less book-keeping.
422
423 Both capturing and non-capturing parentheses may be nested.
424
425 \target greedy quantifiers
426
427 For historical reasons, quantifiers (e.g. \bold{*}) that apply to
428 capturing parentheses are more "greedy" than other quantifiers.
429 For example, \bold{a*(a)*} will match "aaa" with cap(1) == "aaa".
430 This behavior is different from what other regexp engines do
431 (notably, Perl). To obtain a more intuitive capturing behavior,
432 specify QRegExp::RegExp2 to the QRegExp constructor or call
433 setPatternSyntax(QRegExp::RegExp2).
434
435 \target cap_in_a_loop
436
437 When the number of matches cannot be determined in advance, a
438 common idiom is to use cap() in a loop. For example:
439
440 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 0
441
442 \target assertions
443 \section1 Assertions
444
445 Assertions make some statement about the text at the point where
446 they occur in the regexp but they do not match any characters. In
447 the following list \bold{\e {E}} stands for any expression.
448
449 \table
450 \row \i \bold{^}
451 \i The caret signifies the beginning of the string. If you
452 wish to match a literal \c{^} you must escape it by
453 writing \c{\\^}. For example, \bold{^#include} will only
454 match strings which \e begin with the characters '#include'.
455 (When the caret is the first character of a character set it
456 has a special meaning, see \link #sets-of-characters Sets of
457 Characters \endlink.)
458
459 \row \i \bold{$}
460 \i The dollar signifies the end of the string. For example
461 \bold{\\d\\s*$} will match strings which end with a digit
462 optionally followed by whitespace. If you wish to match a
463 literal \c{$} you must escape it by writing
464 \c{\\$}.
465
466 \row \i \bold{\\b}
467 \i A word boundary. For example the regexp
468 \bold{\\bOK\\b} means match immediately after a word
469 boundary (e.g. start of string or whitespace) the letter 'O'
470 then the letter 'K' immediately before another word boundary
471 (e.g. end of string or whitespace). But note that the
472 assertion does not actually match any whitespace so if we
473 write \bold{(\\bOK\\b)} and we have a match it will only
474 contain 'OK' even if the string is "It's \underline{OK} now".
475
476 \row \i \bold{\\B}
477 \i A non-word boundary. This assertion is true wherever
478 \bold{\\b} is false. For example if we searched for
479 \bold{\\Bon\\B} in "Left on" the match would fail (space
480 and end of string aren't non-word boundaries), but it would
481 match in "t\underline{on}ne".
482
483 \row \i \bold{(?=\e E)}
484 \i Positive lookahead. This assertion is true if the
485 expression matches at this point in the regexp. For example,
486 \bold{const(?=\\s+char)} matches 'const' whenever it is
487 followed by 'char', as in 'static \underline{const} char *'.
488 (Compare with \bold{const\\s+char}, which matches 'static
489 \underline{const char} *'.)
490
491 \row \i \bold{(?!\e E)}
492 \i Negative lookahead. This assertion is true if the
493 expression does not match at this point in the regexp. For
494 example, \bold{const(?!\\s+char)} matches 'const' \e except
495 when it is followed by 'char'.
496 \endtable
497
498 \keyword QRegExp wildcard matching
499 \section1 Wildcard Matching
500
501 Most command shells such as \e bash or \e cmd.exe support "file
502 globbing", the ability to identify a group of files by using
503 wildcards. The setPatternSyntax() function is used to switch
504 between regexp and wildcard mode. Wildcard matching is much
505 simpler than full regexps and has only four features:
506
507 \table
508 \row \i \bold{c}
509 \i Any character represents itself apart from those mentioned
510 below. Thus \bold{c} matches the character \e c.
511 \row \i \bold{?}
512 \i Matches any single character. It is the same as
513 \bold{.} in full regexps.
514 \row \i \bold{*}
515 \i Matches zero or more of any characters. It is the
516 same as \bold{.*} in full regexps.
517 \row \i \bold{[...]}
518 \i Sets of characters can be represented in square brackets,
519 similar to full regexps. Within the character class, like
520 outside, backslash has no special meaning.
521 \endtable
522
523 For example if we are in wildcard mode and have strings which
524 contain filenames we could identify HTML files with \bold{*.html}.
525 This will match zero or more characters followed by a dot followed
526 by 'h', 't', 'm' and 'l'.
527
528 To test a string against a wildcard expression, use exactMatch().
529 For example:
530
531 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 1
532
533 \target perl-users
534 \section1 Notes for Perl Users
535
536 Most of the character class abbreviations supported by Perl are
537 supported by QRegExp, see \link
538 #characters-and-abbreviations-for-sets-of-characters characters
539 and abbreviations for sets of characters \endlink.
540
541 In QRegExp, apart from within character classes, \c{^} always
542 signifies the start of the string, so carets must always be
543 escaped unless used for that purpose. In Perl the meaning of caret
544 varies automagically depending on where it occurs so escaping it
545 is rarely necessary. The same applies to \c{$} which in
546 QRegExp always signifies the end of the string.
547
548 QRegExp's quantifiers are the same as Perl's greedy quantifiers
549 (but see the \l{greedy quantifiers}{note above}). Non-greedy
550 matching cannot be applied to individual quantifiers, but can be
551 applied to all the quantifiers in the pattern. For example, to
552 match the Perl regexp \bold{ro+?m} requires:
553
554 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 2
555
556 The equivalent of Perl's \c{/i} option is
557 setCaseSensitivity(Qt::CaseInsensitive).
558
559 Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.
560
561 In QRegExp \bold{.} matches any character, therefore all QRegExp
562 regexps have the equivalent of Perl's \c{/s} option. QRegExp
563 does not have an equivalent to Perl's \c{/m} option, but this
564 can be emulated in various ways for example by splitting the input
565 into lines or by looping with a regexp that searches for newlines.
566
567 Because QRegExp is string oriented, there are no \\A, \\Z, or \\z
568 assertions. The \\G assertion is not supported but can be emulated
569 in a loop.
570
571 Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
572 equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
573 ... correspond to cap(1) or capturedTexts()[1], cap(2) or
574 capturedTexts()[2], etc.
575
576 To substitute a pattern use QString::replace().
577
578 Perl's extended \c{/x} syntax is not supported, nor are
579 directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
580 the other hand, C++'s rules for literal strings can be used to
581 achieve the same:
582
583 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 3
584
585 Both zero-width positive and zero-width negative lookahead
586 assertions (?=pattern) and (?!pattern) are supported with the same
587 syntax as Perl. Perl's lookbehind assertions, "independent"
588 subexpressions and conditional expressions are not supported.
589
590 Non-capturing parentheses are also supported, with the same
591 (?:pattern) syntax.
592
593 See QString::split() and QStringList::join() for equivalents
594 to Perl's split and join functions.
595
596 Note: because C++ transforms \\'s they must be written \e twice in
597 code, e.g. \bold{\\b} must be written \bold{\\\\b}.
598
599 \target code-examples
600 \section1 Code Examples
601
602 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 4
603
604 The third string matches '\underline{6}'. This is a simple validation
605 regexp for integers in the range 0 to 99.
606
607 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 5
608
609 The second string matches '\underline{This_is-OK}'. We've used the
610 character set abbreviation '\\S' (non-whitespace) and the anchors
611 to match strings which contain no whitespace.
612
613 In the following example we match strings containing 'mail' or
614 'letter' or 'correspondence' but only match whole words i.e. not
615 'email'
616
617 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 6
618
619 The second string matches "Please write the \underline{letter}". The
620 word 'letter' is also captured (because of the parentheses). We
621 can see what text we've captured like this:
622
623 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 7
624
625 This will capture the text from the first set of capturing
626 parentheses (counting capturing left parentheses from left to
627 right). The parentheses are counted from 1 since cap(0) is the
628 whole matched regexp (equivalent to '&' in most regexp engines).
629
630 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 8
631
632 Here we've passed the QRegExp to QString's replace() function to
633 replace the matched text with new text.
634
635 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 9
636
637 We've used the indexIn() function to repeatedly match the regexp in
638 the string. Note that instead of moving forward by one character
639 at a time \c pos++ we could have written \c {pos +=
640 rx.matchedLength()} to skip over the already matched string. The
641 count will equal 3, matching 'One \underline{Eric} another
642 \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it
643 doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
644 by word boundaries.
645
646 One common use of regexps is to split lines of delimited data into
647 their component fields.
648
649 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 10
650
651 In this example our input lines have the format company name, web
652 address and country. Unfortunately the regexp is rather long and
653 not very versatile -- the code will break if we add any more
654 fields. A simpler and better solution is to look for the
655 separator, '\\t' in this case, and take the surrounding text. The
656 QString::split() function can take a separator string or regexp
657 as an argument and split a string accordingly.
658
659 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 11
660
661 Here field[0] is the company, field[1] the web address and so on.
662
663 To imitate the matching of a shell we can use wildcard mode.
664
665 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 12
666
667 Wildcard matching can be convenient because of its simplicity, but
668 any wildcard regexp can be defined using full regexps, e.g.
669 \bold{.*\\.html$}. Notice that we can't match both \c .html and \c
670 .htm files with a wildcard unless we use \bold{*.htm*} which will
671 also match 'test.html.bak'. A full regexp gives us the precision
672 we need, \bold{.*\\.html?$}.
673
674 QRegExp can match case insensitively using setCaseSensitivity(),
675 and can use non-greedy matching, see setMinimal(). By
676 default QRegExp uses full regexps but this can be changed with
677 setPatternSyntax(). Searching can be forward with indexIn() or backward
678 with lastIndexIn(). Captured text can be accessed using
679 capturedTexts() which returns a string list of all captured
680 strings, or using cap() which returns the captured string for the
681 given index. The pos() function takes a match index and returns
682 the position in the string where the match was made (or -1 if
683 there was no match).
684
685 \sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel,
686 {tools/regexp}{Regular Expression Example}
687*/
688
// BadChar() reduces a character's unicode() value modulo NumBadChars,
// so its result always lies in the range [0, NumBadChars).
// NOTE(review): presumably NumBadChars is the size of a per-character
// lookup table indexed by BadChar() -- confirm against its users.
const int NumBadChars = 64;
#define BadChar(ch) ((ch).unicode() % NumBadChars)

// Sentinel and limit values. Three of them share the value INT_MAX.
// NOTE(review): the meaning of each is inferred from its name only
// (no occurrence found, empty capture, unbounded length, unbounded
// repetition count, end of string) -- confirm against the code that
// uses them.
const int NoOccurrence = INT_MAX;
const int EmptyCapture = INT_MAX;
const int InftyLen = INT_MAX;
const int InftyRep = 1025;
const int EOS = -1;
697
698static bool isWord(QChar ch)
699{
700 return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_');
701}
702
703/*
704 Merges two vectors of ints and puts the result into the first
705 one.
706*/
707static void mergeInto(QVector<int> *a, const QVector<int> &b)
708{
709 int asize = a->size();
710 int bsize = b.size();
711 if (asize == 0) {
712 *a = b;
713#ifndef QT_NO_REGEXP_OPTIM
714 } else if (bsize == 1 && a->at(asize - 1) < b.at(0)) {
715 a->resize(asize + 1);
716 (*a)[asize] = b.at(0);
717#endif
718 } else if (bsize >= 1) {
719 int csize = asize + bsize;
720 QVector<int> c(csize);
721 int i = 0, j = 0, k = 0;
722 while (i < asize) {
723 if (j < bsize) {
724 if (a->at(i) == b.at(j)) {
725 ++i;
726 --csize;
727 } else if (a->at(i) < b.at(j)) {
728 c[k++] = a->at(i++);
729 } else {
730 c[k++] = b.at(j++);
731 }
732 } else {
733 memcpy(c.data() + k, a->constData() + i, (asize - i) * sizeof(int));
734 break;
735 }
736 }
737 c.resize(csize);
738 if (j < bsize)
739 memcpy(c.data() + k, b.constData() + j, (bsize - j) * sizeof(int));
740 *a = c;
741 }
742}
743
744#ifndef QT_NO_REGEXP_WILDCARD
745/*
746 Translates a wildcard pattern to an equivalent regular expression
747 pattern (e.g., *.cpp to .*\.cpp).
748*/
749static QString wc2rx(const QString &wc_str)
750{
751 int wclen = wc_str.length();
752 QString rx;
753 int i = 0;
754 const QChar *wc = wc_str.unicode();
755 while (i < wclen) {
756 QChar c = wc[i++];
757 switch (c.unicode()) {
758 case '*':
759 rx += QLatin1String(".*");
760 break;
761 case '?':
762 rx += QLatin1Char('.');
763 break;
764 case '$':
765 case '(':
766 case ')':
767 case '+':
768 case '.':
769 case '\\':
770 case '^':
771 case '{':
772 case '|':
773 case '}':
774 rx += QLatin1Char('\\');
775 rx += c;
776 break;
777 case '[':
778 rx += c;
779 if (wc[i] == QLatin1Char('^'))
780 rx += wc[i++];
781 if (i < wclen) {
782 if (rx[i] == QLatin1Char(']'))
783 rx += wc[i++];
784 while (i < wclen && wc[i] != QLatin1Char(']')) {
785 if (wc[i] == QLatin1Char('\\'))
786 rx += QLatin1Char('\\');
787 rx += wc[i++];
788 }
789 }
790 break;
791 default:
792 rx += c;
793 }
794 }
795 return rx;
796}
797#endif
798
799static int caretIndex(int offset, QRegExp::CaretMode caretMode)
800{
801 if (caretMode == QRegExp::CaretAtZero) {
802 return 0;
803 } else if (caretMode == QRegExp::CaretAtOffset) {
804 return offset;
805 } else { // QRegExp::CaretWontMatch
806 return -1;
807 }
808}
809
/*
    The QRegExpEngineKey struct uniquely identifies an engine: two
    keys compare equal (see operator==) when their pattern string,
    pattern syntax and case sensitivity are all equal.
*/
struct QRegExpEngineKey
{
    QString pattern; // the pattern text, as written by the user
    QRegExp::PatternSyntax patternSyntax; // how pattern is to be interpreted
    Qt::CaseSensitivity cs; // case sensitive?

    inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,
                            Qt::CaseSensitivity cs)
        : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {}

    // Resets the key to an empty, case-sensitive QRegExp::RegExp pattern.
    inline void clear() {
        pattern.clear();
        patternSyntax = QRegExp::RegExp;
        cs = Qt::CaseSensitive;
    }
};
829
830bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)
831{
832 return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax
833 && key1.cs == key2.cs;
834}
835
836class QRegExpEngine;
837
838//Q_DECLARE_TYPEINFO(QVector<int>, Q_MOVABLE_TYPE);
839
/*
    This is the engine state during matching.

    All the int arrays below are carved out of the single bigArray
    block by prepareForMatch(). The block is released by drain() or by
    the destructor.
*/
struct QRegExpMatchState
{
    const QChar *in; // a pointer to the input string data
    int pos; // the current position in the string
    int caretPos; // the position at which a caret anchor matches
    int len; // the length of the input string
    bool minimal; // minimal matching?
    int *bigArray; // big array holding the data for the next pointers
    int *inNextStack; // is state in nextStack?
    int *curStack; // stack of current states
    int *nextStack; // stack of next states
    int *curCapBegin; // start of current states' captures
    int *nextCapBegin; // start of next states' captures
    int *curCapEnd; // end of current states' captures
    int *nextCapEnd; // end of next states' captures
    int *tempCapBegin; // start of temporary captures
    int *tempCapEnd; // end of temporary captures
    int *capBegin; // start of captures for a next state
    int *capEnd; // end of captures for a next state
    int *slideTab; // bump-along slide table for bad-character heuristic
    int *captured; // what match() returned last
    int slideTabSize; // size of slide table
    int capturedSize; // number of ints in captured
#ifndef QT_NO_REGEXP_BACKREF
    QList<QVector<int> > sleeping; // list of back-reference sleepers
#endif
    int matchLen; // length of match
    int oneTestMatchedLen; // length of partial match

    const QRegExpEngine *eng; // the engine set by prepareForMatch()

    inline QRegExpMatchState() : bigArray(0), captured(0) {}
    inline ~QRegExpMatchState() { free(bigArray); }

    void drain() { free(bigArray); bigArray = 0; captured = 0; } // to save memory
    void prepareForMatch(QRegExpEngine *eng);
    void match(const QChar *str, int len, int pos, bool minimal,
        bool oneTest, int caretIndex);
    bool matchHere();
    bool testAnchor(int i, int a, const int *capBegin);
};
884
/*
    The struct QRegExpAutomatonState represents one state in a modified NFA. The
    input characters matched are stored in the state instead of on
    the transitions, something possible for an automaton
    constructed from a regular expression.

    The maps reenter and anchors are keyed by the number of the target
    state of an out-transition.
*/
struct QRegExpAutomatonState
{
#ifndef QT_NO_REGEXP_CAPTURE
    int atom; // which atom does this state belong to?
#endif
    int match; // what does it match? (see CharClassBit and BackRefBit)
    QVector<int> outs; // out-transitions
    QMap<int, int> reenter; // atoms reentered when transiting out
    QMap<int, int> anchors; // anchors met when transiting out

    // Leaves atom and match uninitialized.
    inline QRegExpAutomatonState() { }
#ifndef QT_NO_REGEXP_CAPTURE
    inline QRegExpAutomatonState(int a, int m)
        : atom(a), match(m) { }
#else
    inline QRegExpAutomatonState(int m)
        : match(m) { }
#endif
};

Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE);
912
/*
    The struct QRegExpCharClassRange represents a range of characters (e.g.,
    [0-9] denotes range 48 to 57). The values in the member comments
    are those of that [0-9] example.
*/
struct QRegExpCharClassRange
{
    ushort from; // first character of the range: 48
    ushort len; // number of characters in the range: 10
};

Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
924
#ifndef QT_NO_REGEXP_CAPTURE
/*
    The struct QRegExpAtom represents one node in the hierarchy of regular
    expression atoms.
*/
struct QRegExpAtom
{
    // Special values of capture, besides a real capture index.
    enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 };

    int parent; // index of parent in array of atoms
    int capture; // index of capture, from 1 to ncap - 1
};

Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
#endif
940
941struct QRegExpLookahead;
942
#ifndef QT_NO_REGEXP_ANCHOR_ALT
/*
    The struct QRegExpAnchorAlternation represents a pair of anchors with
    OR semantics: the pair matches when a matches or b matches.
*/
struct QRegExpAnchorAlternation
{
    int a; // this anchor...
    int b; // ...or this one
};

Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
#endif
956
#ifndef QT_NO_REGEXP_CCLASS
/*
    The class QRegExpCharClass represents a set of characters, such as can
    be found in regular expressions (e.g., [a-z] denotes the set
    {a, b, ..., z}).
*/
class QRegExpCharClass
{
public:
    QRegExpCharClass();
    // Copy construction is implemented in terms of operator=().
    inline QRegExpCharClass(const QRegExpCharClass &cc) { operator=(cc); }

    QRegExpCharClass &operator=(const QRegExpCharClass &cc);

    void clear();
    bool negative() const { return n; }
    void setNegative(bool negative);
    void addCategories(int cats);
    void addRange(ushort from, ushort to);
    // A single character is added as the range [ch, ch].
    void addSingleton(ushort ch) { addRange(ch, ch); }

    bool in(QChar ch) const;
#ifndef QT_NO_REGEXP_OPTIM
    const QVector<int> &firstOccurrence() const { return occ1; }
#endif

#if defined(QT_DEBUG)
    void dump() const;
#endif

private:
    int c; // character classes
    QVector<QRegExpCharClassRange> r; // character ranges
    bool n; // negative?
#ifndef QT_NO_REGEXP_OPTIM
    QVector<int> occ1; // first-occurrence array
#endif
};
#else
/*
    Placeholder used when character classes are compiled out. With
    optimizations enabled it still provides a first-occurrence array,
    filled with NumBadChars zeros.
*/
struct QRegExpCharClass
{
    int dummy;

#ifndef QT_NO_REGEXP_OPTIM
    QRegExpCharClass() { occ1.fill(0, NumBadChars); }

    const QVector<int> &firstOccurrence() const { return occ1; }
    QVector<int> occ1;
#endif
};
#endif

Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE);
1010
/*
    The QRegExpEngine class encapsulates a modified nondeterministic
    finite automaton (NFA).

    The class declaration also holds the lexical analyzer (yy*
    members, getToken() and friends) and the syntactic analyzer
    (parse*() functions) that build the automaton from a pattern, and
    the nested Box class used as the unit of construction.
*/
class QRegExpEngine
{
public:
    // Creates an engine without parsing any pattern.
    QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)
        : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }

    // Creates an engine and parses the pattern described by key.
    QRegExpEngine(const QRegExpEngineKey &key);
    ~QRegExpEngine();

    bool isValid() const { return valid; }
    const QString &errorString() const { return yyError; }
    int numCaptures() const { return officialncap; }

    // Each createState() overload adds one state and returns its number.
    int createState(QChar ch);
    int createState(const QRegExpCharClass &cc);
#ifndef QT_NO_REGEXP_BACKREF
    int createState(int bref);
#endif

    void addCatTransitions(const QVector<int> &from, const QVector<int> &to);
#ifndef QT_NO_REGEXP_CAPTURE
    void addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom);
#endif

#ifndef QT_NO_REGEXP_ANCHOR_ALT
    int anchorAlternation(int a, int b);
    int anchorConcatenation(int a, int b);
#else
    // Without anchor alternations, anchors are plain bit masks.
    int anchorAlternation(int a, int b) { return a & b; }
    int anchorConcatenation(int a, int b) { return a | b; }
#endif
    void addAnchors(int from, int to, int a);

#ifndef QT_NO_REGEXP_OPTIM
    void heuristicallyChooseHeuristic();
#endif

#if defined(QT_DEBUG)
    void dump() const;
#endif

    QAtomicInt ref; // reference count; set to 1 by setup()

private:
    // Flags or'ed into QRegExpAutomatonState::match by createState().
    enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
    enum { InitialState = 0, FinalState = 1 };

    void setup();
    int setupState(int match);

    /*
        Let's hope that 13 lookaheads and 14 back-references are
        enough.
    */
    enum { MaxLookaheads = 13, MaxBackRefs = 14 };
    /*
        Bits of an anchor value. The four lowest bits are the simple
        anchors. They are followed by MaxLookaheads lookahead bits
        starting at Anchor_FirstLookahead, then by MaxBackRefs
        "back-reference is empty" bits starting at
        Anchor_BackRef1Empty. When Anchor_Alternation is set, the
        remaining bits are an index into aa instead.
    */
    enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,
           Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,
           Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
           Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
           Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,

           Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^
                   ((Anchor_FirstLookahead << MaxLookaheads) - 1) };
#ifndef QT_NO_REGEXP_CAPTURE
    int startAtom(bool officialCapture);
    void finishAtom(int atom, bool needCapture);
#endif

#ifndef QT_NO_REGEXP_LOOKAHEAD
    int addLookahead(QRegExpEngine *eng, bool negative);
#endif

#ifndef QT_NO_REGEXP_OPTIM
    bool goodStringMatch(QRegExpMatchState &matchState) const;
    bool badCharMatch(QRegExpMatchState &matchState) const;
#else
    bool bruteMatch(QRegExpMatchState &matchState) const;
#endif

    QVector<QRegExpAutomatonState> s; // array of states
#ifndef QT_NO_REGEXP_CAPTURE
    QVector<QRegExpAtom> f; // atom hierarchy
    int nf; // number of atoms
    int cf; // current atom
    QVector<int> captureForOfficialCapture;
#endif
    int officialncap; // number of captures, seen from the outside
    int ncap; // number of captures, seen from the inside
#ifndef QT_NO_REGEXP_CCLASS
    QVector<QRegExpCharClass> cl; // array of character classes
#endif
#ifndef QT_NO_REGEXP_LOOKAHEAD
    QVector<QRegExpLookahead *> ahead; // array of lookaheads
#endif
#ifndef QT_NO_REGEXP_ANCHOR_ALT
    QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
#endif
#ifndef QT_NO_REGEXP_OPTIM
    bool caretAnchored; // does the regexp start with ^?
    bool trivial; // is the good-string all that needs to match?
#endif
    bool valid; // is the regular expression valid?
    Qt::CaseSensitivity cs; // case sensitive?
    bool greedyQuantifiers; // RegExp2?
#ifndef QT_NO_REGEXP_BACKREF
    int nbrefs; // number of back-references
#endif

#ifndef QT_NO_REGEXP_OPTIM
    bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch

    int goodEarlyStart; // the index where goodStr can first occur in a match
    int goodLateStart; // the index where goodStr can last occur in a match
    QString goodStr; // the string that any match has to contain

    int minl; // the minimum length of a match
    QVector<int> occ1; // first-occurrence array
#endif

    /*
        The class Box is an abstraction for a regular expression
        fragment. It can also be seen as one node in the syntax tree of
        a regular expression with synthesized attributes.

        Its interface is ugly for performance reasons.
    */
    class Box
    {
    public:
        Box(QRegExpEngine *engine);
        // Copy construction is implemented in terms of operator=().
        Box(const Box &b) { operator=(b); }

        Box &operator=(const Box &b);

        // Resets this box to a freshly constructed one for the same engine.
        void clear() { operator=(Box(eng)); }
        void set(QChar ch);
        void set(const QRegExpCharClass &cc);
#ifndef QT_NO_REGEXP_BACKREF
        void set(int bref);
#endif

        void cat(const Box &b);
        void orx(const Box &b);
        void plus(int atom);
        void opt();
        void catAnchor(int a);
#ifndef QT_NO_REGEXP_OPTIM
        void setupHeuristics();
#endif

#if defined(QT_DEBUG)
        void dump() const;
#endif

    private:
        void addAnchorsToEngine(const Box &to) const;

        QRegExpEngine *eng; // the automaton under construction
        QVector<int> ls; // the left states (firstpos)
        QVector<int> rs; // the right states (lastpos)
        QMap<int, int> lanchors; // the left anchors
        QMap<int, int> ranchors; // the right anchors
        int skipanchors; // the anchors to match if the box is skipped

#ifndef QT_NO_REGEXP_OPTIM
        int earlyStart; // the index where str can first occur
        int lateStart; // the index where str can last occur
        QString str; // a string that has to occur in any match
        QString leftStr; // a string occurring at the left of this box
        QString rightStr; // a string occurring at the right of this box
        int maxl; // the maximum length of this box (possibly InftyLen)
#endif

        int minl; // the minimum length of this box
#ifndef QT_NO_REGEXP_OPTIM
        QVector<int> occ1; // first-occurrence array
#endif
    };

    friend class Box;

    /*
        This is the lexical analyzer for regular expressions.
    */
    enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
           Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
           Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
    int getChar();
    int getEscape();
#ifndef QT_NO_REGEXP_INTERVAL
    int getRep(int def);
#endif
#ifndef QT_NO_REGEXP_LOOKAHEAD
    void skipChars(int n);
#endif
    void error(const char *msg);
    void startTokenizer(const QChar *rx, int len);
    int getToken();

    const QChar *yyIn; // a pointer to the input regular expression pattern
    int yyPos0; // the position of yyTok in the input pattern
    int yyPos; // the position of the next character to read
    int yyLen; // the length of yyIn
    int yyCh; // the last character read
    QRegExpCharClass *yyCharClass; // attribute for Tok_CharClass tokens
    int yyMinRep; // attribute for Tok_Quantifier
    int yyMaxRep; // ditto
    QString yyError; // syntax error or overflow during parsing?

    /*
        This is the syntactic analyzer for regular expressions.
    */
    int parse(const QChar *rx, int len);
    void parseAtom(Box *box);
    void parseFactor(Box *box);
    void parseTerm(Box *box);
    void parseExpression(Box *box);

    int yyTok; // the last token read
    bool yyMayCapture; // set this to false to disable capturing

    friend struct QRegExpMatchState;
};
1238
#ifndef QT_NO_REGEXP_LOOKAHEAD
/*
    The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
    (?=foo) and (?!bar)).

    The lookahead owns its engine: the destructor deletes eng.
*/
struct QRegExpLookahead
{
    QRegExpEngine *eng; // NFA representing the embedded regular expression
    bool neg; // negative lookahead?

    inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)
        : eng(eng0), neg(neg0) { }
    inline ~QRegExpLookahead() { delete eng; }
};
#endif
1254
1255QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
1256 : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2)
1257{
1258 setup();
1259
1260 QString rx;
1261
1262 switch (key.patternSyntax) {
1263 case QRegExp::Wildcard:
1264#ifndef QT_NO_REGEXP_WILDCARD
1265 rx = wc2rx(key.pattern);
1266#endif
1267 break;
1268 case QRegExp::FixedString:
1269 rx = QRegExp::escape(key.pattern);
1270 break;
1271 default:
1272 rx = key.pattern;
1273 }
1274
1275 valid = (parse(rx.unicode(), rx.length()) == rx.length());
1276 if (!valid) {
1277#ifndef QT_NO_REGEXP_OPTIM
1278 trivial = false;
1279#endif
1280 error(RXERR_LEFTDELIM);
1281 }
1282}
1283
1284QRegExpEngine::~QRegExpEngine()
1285{
1286#ifndef QT_NO_REGEXP_LOOKAHEAD
1287 qDeleteAll(ahead);
1288#endif
1289}
1290
1291void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
1292{
1293 /*
1294 We use one QVector<int> for all the big data used a lot in
1295 matchHere() and friends.
1296 */
1297 int ns = eng->s.size(); // number of states
1298 int ncap = eng->ncap;
1299#ifndef QT_NO_REGEXP_OPTIM
1300 slideTabSize = qMax(eng->minl + 1, 16);
1301#else
1302 slideTabSize = 0;
1303#endif
1304 int numCaptures = eng->numCaptures();
1305 capturedSize = 2 + 2 * numCaptures;
1306 bigArray = (int *)realloc(bigArray, ((3 + 4 * ncap) * ns + 4 * ncap + slideTabSize + capturedSize)*sizeof(int));
1307
1308 inNextStack = bigArray;
1309 memset(inNextStack, -1, ns * sizeof(int));
1310 curStack = inNextStack + ns;
1311 nextStack = inNextStack + 2 * ns;
1312
1313 curCapBegin = inNextStack + 3 * ns;
1314 nextCapBegin = curCapBegin + ncap * ns;
1315 curCapEnd = curCapBegin + 2 * ncap * ns;
1316 nextCapEnd = curCapBegin + 3 * ncap * ns;
1317
1318 tempCapBegin = curCapBegin + 4 * ncap * ns;
1319 tempCapEnd = tempCapBegin + ncap;
1320 capBegin = tempCapBegin + 2 * ncap;
1321 capEnd = tempCapBegin + 3 * ncap;
1322
1323 slideTab = tempCapBegin + 4 * ncap;
1324 captured = slideTab + slideTabSize;
1325 memset(captured, -1, capturedSize*sizeof(int));
1326 this->eng = eng;
1327}
1328
1329/*
1330 Tries to match in str and returns an array of (begin, length) pairs
1331 for captured text. If there is no match, all pairs are (-1, -1).
1332*/
1333void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,
1334 bool minimal0, bool oneTest, int caretIndex)
1335{
1336 bool matched = false;
1337 QChar char_null;
1338
1339#ifndef QT_NO_REGEXP_OPTIM
1340 if (eng->trivial && !oneTest) {
1341 pos = qFindString(str0, len0, pos0, eng->goodStr.unicode(), eng->goodStr.length(), eng->cs);
1342 matchLen = eng->goodStr.length();
1343 matched = (pos != -1);
1344 } else
1345#endif
1346 {
1347 in = str0;
1348 if (in == 0)
1349 in = &char_null;
1350 pos = pos0;
1351 caretPos = caretIndex;
1352 len = len0;
1353 minimal = minimal0;
1354 matchLen = 0;
1355 oneTestMatchedLen = 0;
1356
1357 if (eng->valid && pos >= 0 && pos <= len) {
1358#ifndef QT_NO_REGEXP_OPTIM
1359 if (oneTest) {
1360 matched = matchHere();
1361 } else {
1362 if (pos <= len - eng->minl) {
1363 if (eng->caretAnchored) {
1364 matched = matchHere();
1365 } else if (eng->useGoodStringHeuristic) {
1366 matched = eng->goodStringMatch(*this);
1367 } else {
1368 matched = eng->badCharMatch(*this);
1369 }
1370 }
1371 }
1372#else
1373 matched = oneTest ? matchHere() : eng->bruteMatch(*this);
1374#endif
1375 }
1376 }
1377
1378 if (matched) {
1379 int *c = captured;
1380 *c++ = pos;
1381 *c++ = matchLen;
1382
1383 int numCaptures = (capturedSize - 2) >> 1;
1384#ifndef QT_NO_REGEXP_CAPTURE
1385 for (int i = 0; i < numCaptures; ++i) {
1386 int j = eng->captureForOfficialCapture.at(i);
1387 int len = capEnd[j] - capBegin[j];
1388 *c++ = (len > 0) ? pos + capBegin[j] : 0;
1389 *c++ = len;
1390 }
1391#endif
1392 } else {
1393 // we rely on 2's complement here
1394 memset(captured, -1, capturedSize * sizeof(int));
1395 }
1396}
1397
1398/*
1399 The three following functions add one state to the automaton and
1400 return the number of the state.
1401*/
1402
1403int QRegExpEngine::createState(QChar ch)
1404{
1405 return setupState(ch.unicode());
1406}
1407
1408int QRegExpEngine::createState(const QRegExpCharClass &cc)
1409{
1410#ifndef QT_NO_REGEXP_CCLASS
1411 int n = cl.size();
1412 cl += QRegExpCharClass(cc);
1413 return setupState(CharClassBit | n);
1414#else
1415 Q_UNUSED(cc);
1416 return setupState(CharClassBit);
1417#endif
1418}
1419
1420#ifndef QT_NO_REGEXP_BACKREF
1421int QRegExpEngine::createState(int bref)
1422{
1423 if (bref > nbrefs) {
1424 nbrefs = bref;
1425 if (nbrefs > MaxBackRefs) {
1426 error(RXERR_LIMIT);
1427 return 0;
1428 }
1429 }
1430 return setupState(BackRefBit | bref);
1431}
1432#endif
1433
1434/*
1435 The two following functions add a transition between all pairs of
1436 states (i, j) where i is found in from, and j is found in to.
1437
1438 Cat-transitions are distinguished from plus-transitions for
1439 capturing.
1440*/
1441
1442void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to)
1443{
1444 for (int i = 0; i < from.size(); i++)
1445 mergeInto(&s[from.at(i)].outs, to);
1446}
1447
1448#ifndef QT_NO_REGEXP_CAPTURE
1449void QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom)
1450{
1451 for (int i = 0; i < from.size(); i++) {
1452 QRegExpAutomatonState &st = s[from.at(i)];
1453 const QVector<int> oldOuts = st.outs;
1454 mergeInto(&st.outs, to);
1455 if (f.at(atom).capture != QRegExpAtom::NoCapture) {
1456 for (int j = 0; j < to.size(); j++) {
1457 // ### st.reenter.contains(to.at(j)) check looks suspicious
1458 if (!st.reenter.contains(to.at(j)) &&
1459 qBinaryFind(oldOuts.constBegin(), oldOuts.constEnd(), to.at(j)) == oldOuts.end())
1460 st.reenter.insert(to.at(j), atom);
1461 }
1462 }
1463 }
1464}
1465#endif
1466
1467#ifndef QT_NO_REGEXP_ANCHOR_ALT
1468/*
1469 Returns an anchor that means a OR b.
1470*/
1471int QRegExpEngine::anchorAlternation(int a, int b)
1472{
1473 if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0)
1474 return a & b;
1475
1476 int n = aa.size();
1477#ifndef QT_NO_REGEXP_OPTIM
1478 if (n > 0 && aa.at(n - 1).a == a && aa.at(n - 1).b == b)
1479 return Anchor_Alternation | (n - 1);
1480#endif
1481
1482 aa.resize(n + 1);
1483 aa[n].a = a;
1484 aa[n].b = b;
1485 return Anchor_Alternation | n;
1486}
1487
1488/*
1489 Returns an anchor that means a AND b.
1490*/
1491int QRegExpEngine::anchorConcatenation(int a, int b)
1492{
1493 if (((a | b) & Anchor_Alternation) == 0)
1494 return a | b;
1495 if ((b & Anchor_Alternation) != 0)
1496 qSwap(a, b);
1497
1498 int aprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).a, b);
1499 int bprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).b, b);
1500 return anchorAlternation(aprime, bprime);
1501}
1502#endif
1503
1504/*
1505 Adds anchor a on a transition caracterised by its from state and
1506 its to state.
1507*/
1508void QRegExpEngine::addAnchors(int from, int to, int a)
1509{
1510 QRegExpAutomatonState &st = s[from];
1511 if (st.anchors.contains(to))
1512 a = anchorAlternation(st.anchors.value(to), a);
1513 st.anchors.insert(to, a);
1514}
1515
1516#ifndef QT_NO_REGEXP_OPTIM
1517/*
1518 This function chooses between the good-string and the bad-character
1519 heuristics. It computes two scores and chooses the heuristic with
1520 the highest score.
1521
1522 Here are some common-sense constraints on the scores that should be
1523 respected if the formulas are ever modified: (1) If goodStr is
1524 empty, the good-string heuristic scores 0. (2) If the regular
1525 expression is trivial, the good-string heuristic should be used.
1526 (3) If the search is case insensitive, the good-string heuristic
1527 should be used, unless it scores 0. (Case insensitivity turns all
1528 entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1529 big, the good-string heuristic should score less.
1530*/
1531void QRegExpEngine::heuristicallyChooseHeuristic()
1532{
1533 if (minl == 0) {
1534 useGoodStringHeuristic = false;
1535 } else if (trivial) {
1536 useGoodStringHeuristic = true;
1537 } else {
1538 /*
1539 Magic formula: The good string has to constitute a good
1540 proportion of the minimum-length string, and appear at a
1541 more-or-less known index.
1542 */
1543 int goodStringScore = (64 * goodStr.length() / minl) -
1544 (goodLateStart - goodEarlyStart);
1545 /*
1546 Less magic formula: We pick some characters at random, and
1547 check whether they are good or bad.
1548 */
1549 int badCharScore = 0;
1550 int step = qMax(1, NumBadChars / 32);
1551 for (int i = 1; i < NumBadChars; i += step) {
1552 if (occ1.at(i) == NoOccurrence)
1553 badCharScore += minl;
1554 else
1555 badCharScore += occ1.at(i);
1556 }
1557 badCharScore /= minl;
1558 useGoodStringHeuristic = (goodStringScore > badCharScore);
1559 }
1560}
1561#endif
1562
1563#if defined(QT_DEBUG)
1564void QRegExpEngine::dump() const
1565{
1566 int i, j;
1567 qDebug("Case %ssensitive engine", cs ? "" : "in");
1568 qDebug(" States");
1569 for (i = 0; i < s.size(); i++) {
1570 qDebug(" %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
1571#ifndef QT_NO_REGEXP_CAPTURE
1572 if (nf > 0)
1573 qDebug(" in atom %d", s[i].atom);
1574#endif
1575 int m = s[i].match;
1576 if ((m & CharClassBit) != 0) {
1577 qDebug(" match character class %d", m ^ CharClassBit);
1578#ifndef QT_NO_REGEXP_CCLASS
1579 cl[m ^ CharClassBit].dump();
1580#else
1581 qDebug(" negative character class");
1582#endif
1583 } else if ((m & BackRefBit) != 0) {
1584 qDebug(" match back-reference %d", m ^ BackRefBit);
1585 } else if (m >= 0x20 && m <= 0x7e) {
1586 qDebug(" match 0x%.4x (%c)", m, m);
1587 } else {
1588 qDebug(" match 0x%.4x", m);
1589 }
1590 for (j = 0; j < s[i].outs.size(); j++) {
1591 int next = s[i].outs[j];
1592 qDebug(" -> %d", next);
1593 if (s[i].reenter.contains(next))
1594 qDebug(" [reenter %d]", s[i].reenter[next]);
1595 if (s[i].anchors.value(next) != 0)
1596 qDebug(" [anchors 0x%.8x]", s[i].anchors[next]);
1597 }
1598 }
1599#ifndef QT_NO_REGEXP_CAPTURE
1600 if (nf > 0) {
1601 qDebug(" Atom Parent Capture");
1602 for (i = 0; i < nf; i++) {
1603 if (f[i].capture == QRegExpAtom::NoCapture) {
1604 qDebug(" %6d %6d nil", i, f[i].parent);
1605 } else {
1606 int cap = f[i].capture;
1607 bool official = captureForOfficialCapture.contains(cap);
1608 qDebug(" %6d %6d %6d %s", i, f[i].parent, f[i].capture,
1609 official ? "official" : "");
1610 }
1611 }
1612 }
1613#endif
1614#ifndef QT_NO_REGEXP_ANCHOR_ALT
1615 for (i = 0; i < aa.size(); i++)
1616 qDebug(" Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b);
1617#endif
1618}
1619#endif
1620
1621void QRegExpEngine::setup()
1622{
1623 ref = 1;
1624#ifndef QT_NO_REGEXP_CAPTURE
1625 f.resize(32);
1626 nf = 0;
1627 cf = -1;
1628#endif
1629 officialncap = 0;
1630 ncap = 0;
1631#ifndef QT_NO_REGEXP_OPTIM
1632 caretAnchored = true;
1633 trivial = true;
1634#endif
1635 valid = false;
1636#ifndef QT_NO_REGEXP_BACKREF
1637 nbrefs = 0;
1638#endif
1639#ifndef QT_NO_REGEXP_OPTIM
1640 useGoodStringHeuristic = true;
1641 minl = 0;
1642 occ1.fill(0, NumBadChars);
1643#endif
1644}
1645
1646int QRegExpEngine::setupState(int match)
1647{
1648#ifndef QT_NO_REGEXP_CAPTURE
1649 s += QRegExpAutomatonState(cf, match);
1650#else
1651 s += QRegExpAutomatonState(match);
1652#endif
1653 return s.size() - 1;
1654}
1655
1656#ifndef QT_NO_REGEXP_CAPTURE
1657/*
1658 Functions startAtom() and finishAtom() should be called to delimit
1659 atoms. When a state is created, it is assigned to the current atom.
1660 The information is later used for capturing.
1661*/
1662int QRegExpEngine::startAtom(bool officialCapture)
1663{
1664 if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size())
1665 f.resize((nf + 1) << 1);
1666 f[nf].parent = cf;
1667 cf = nf++;
1668 f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;
1669 return cf;
1670}
1671
1672void QRegExpEngine::finishAtom(int atom, bool needCapture)
1673{
1674 if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture)
1675 f[atom].capture = QRegExpAtom::UnofficialCapture;
1676 cf = f.at(atom).parent;
1677}
1678#endif
1679
1680#ifndef QT_NO_REGEXP_LOOKAHEAD
1681/*
1682 Creates a lookahead anchor.
1683*/
1684int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative)
1685{
1686 int n = ahead.size();
1687 if (n == MaxLookaheads) {
1688 error(RXERR_LIMIT);
1689 return 0;
1690 }
1691 ahead += new QRegExpLookahead(eng, negative);
1692 return Anchor_FirstLookahead << n;
1693}
1694#endif
1695
1696#ifndef QT_NO_REGEXP_CAPTURE
/*
    We want the longest leftmost captures.

    Returns true if capture set 1 is better than capture set 2. The
    first capture index at which the two sets differ decides: the
    earlier start wins and, for equal starts, the later end wins.
    Identical sets are not better than each other.
*/
static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
                            const int *end2)
{
    int idx = 0;
    while (idx < ncap) {
        const int startGain = begin2[idx] - begin1[idx]; // it has to start early...
        const int verdict = (startGain != 0)
                            ? startGain
                            : end1[idx] - end2[idx]; // ...and end late
        if (verdict != 0)
            return verdict > 0;
        ++idx;
    }
    return false;
}
1713#endif
1714
1715/*
1716 Returns true if anchor a matches at position pos + i in the input
1717 string, otherwise false.
1718*/
1719bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)
1720{
1721 int j;
1722
1723#ifndef QT_NO_REGEXP_ANCHOR_ALT
1724 if ((a & QRegExpEngine::Anchor_Alternation) != 0)
1725 return testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)
1726 || testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);
1727#endif
1728
1729 if ((a & QRegExpEngine::Anchor_Caret) != 0) {
1730 if (pos + i != caretPos)
1731 return false;
1732 }
1733 if ((a & QRegExpEngine::Anchor_Dollar) != 0) {
1734 if (pos + i != len)
1735 return false;
1736 }
1737#ifndef QT_NO_REGEXP_ESCAPE
1738 if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) {
1739 bool before = false;
1740 bool after = false;
1741 if (pos + i != 0)
1742 before = isWord(in[pos + i - 1]);
1743 if (pos + i != len)
1744 after = isWord(in[pos + i]);
1745 if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after))
1746 return false;
1747 if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after))
1748 return false;
1749 }
1750#endif
1751#ifndef QT_NO_REGEXP_LOOKAHEAD
1752 if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) {
1753 const QVector<QRegExpLookahead *> &ahead = eng->ahead;
1754 for (j = 0; j < ahead.size(); j++) {
1755 if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) {
1756 QRegExpMatchState matchState;
1757 matchState.prepareForMatch(ahead[j]->eng);
1758 matchState.match(in + pos + i, len - pos - i, 0,
1759 true, true, matchState.caretPos - matchState.pos - i);
1760 if ((matchState.captured[0] == 0) == ahead[j]->neg)
1761 return false;
1762 }
1763 }
1764 }
1765#endif
1766#ifndef QT_NO_REGEXP_CAPTURE
1767#ifndef QT_NO_REGEXP_BACKREF
1768 for (j = 0; j < eng->nbrefs; j++) {
1769 if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) {
1770 int i = eng->captureForOfficialCapture.at(j);
1771 if (capBegin[i] != EmptyCapture)
1772 return false;
1773 }
1774 }
1775#endif
1776#endif
1777 return true;
1778}
1779
1780#ifndef QT_NO_REGEXP_OPTIM
1781/*
1782 The three following functions are what Jeffrey Friedl would call
1783 transmissions (or bump-alongs). Using one or the other should make
1784 no difference except in performance.
1785*/
1786
1787bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
1788{
1789 int k = matchState.pos + goodEarlyStart;
1790 QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs);
1791 while ((k = matcher.indexIn(matchState.in, matchState.len, k)) != -1) {
1792 int from = k - goodLateStart;
1793 int to = k - goodEarlyStart;
1794 if (from > matchState.pos)
1795 matchState.pos = from;
1796
1797 while (matchState.pos <= to) {
1798 if (matchState.matchHere())
1799 return true;
1800 ++matchState.pos;
1801 }
1802 ++k;
1803 }
1804 return false;
1805}
1806
1807bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
1808{
1809 int slideHead = 0;
1810 int slideNext = 0;
1811 int i;
1812 int lastPos = matchState.len - minl;
1813 memset(matchState.slideTab, 0, matchState.slideTabSize * sizeof(int));
1814
1815 /*
1816 Set up the slide table, used for the bad-character heuristic,
1817 using the table of first occurrence of each character.
1818 */
1819 for (i = 0; i < minl; i++) {
1820 int sk = occ1[BadChar(matchState.in[matchState.pos + i])];
1821 if (sk == NoOccurrence)
1822 sk = i + 1;
1823 if (sk > 0) {
1824 int k = i + 1 - sk;
1825 if (k < 0) {
1826 sk = i + 1;
1827 k = 0;
1828 }
1829 if (sk > matchState.slideTab[k])
1830 matchState.slideTab[k] = sk;
1831 }
1832 }
1833
1834 if (matchState.pos > lastPos)
1835 return false;
1836
1837 for (;;) {
1838 if (++slideNext >= matchState.slideTabSize)
1839 slideNext = 0;
1840 if (matchState.slideTab[slideHead] > 0) {
1841 if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext])
1842 matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1;
1843 matchState.slideTab[slideHead] = 0;
1844 } else {
1845 if (matchState.matchHere())
1846 return true;
1847 }
1848
1849 if (matchState.pos == lastPos)
1850 break;
1851
1852 /*
1853 Update the slide table. This code has much in common with
1854 the initialization code.
1855 */
1856 int sk = occ1[BadChar(matchState.in[matchState.pos + minl])];
1857 if (sk == NoOccurrence) {
1858 matchState.slideTab[slideNext] = minl;
1859 } else if (sk > 0) {
1860 int k = slideNext + minl - sk;
1861 if (k >= matchState.slideTabSize)
1862 k -= matchState.slideTabSize;
1863 if (sk > matchState.slideTab[k])
1864 matchState.slideTab[k] = sk;
1865 }
1866 slideHead = slideNext;
1867 ++matchState.pos;
1868 }
1869 return false;
1870}
1871#else
1872bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
1873{
1874 while (matchState.pos <= matchState.len) {
1875 if (matchState.matchHere())
1876 return true;
1877 ++matchState.pos;
1878 }
1879 return false;
1880}
1881#endif
1882
1883/*
1884 Here's the core of the engine. It tries to do a match here and now.
1885*/
/*
  Attempts a match that starts exactly at the current value of pos.

  The automaton is simulated from InitialState, one input character
  per iteration of the outer loop; i is the number of characters
  consumed so far. The live states are kept in curStack and their
  successors are collected in nextStack (inNextStack maps a state to
  its slot there, or -1). When captures are enabled, each live state
  carries its own ncap-sized slice of curCapBegin/curCapEnd.

  On return matchLen holds the value of i at the last time FinalState
  was reached (-1 if it never was), oneTestMatchedLen is set to i - 1,
  and the result is (matchLen >= 0).
*/
1886bool QRegExpMatchState::matchHere()
1887{
1888 int ncur = 1, nnext = 0;
1889 int i = 0, j, k, m;
1890 bool stop = false;
1891
1892 matchLen = -1;
1893 oneTestMatchedLen = -1;
1894 curStack[0] = QRegExpEngine::InitialState;
1895
1896 int ncap = eng->ncap;
1897#ifndef QT_NO_REGEXP_CAPTURE
1898 if (ncap > 0) {
1899 for (j = 0; j < ncap; j++) {
1900 curCapBegin[j] = EmptyCapture;
1901 curCapEnd[j] = EmptyCapture;
1902 }
1903 }
1904#endif
1905
/*
  Keep going while there is a live state (or, with back-references,
  a sleeping one), while i has not gone past the end of the input
  (i == len - pos is still processed), and while nobody asked to
  stop.
*/
1906#ifndef QT_NO_REGEXP_BACKREF
1907 while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop)
1908#else
1909 while (ncur > 0 && i <= len - pos && !stop)
1910#endif
1911 {
// ch is 0 once i has reached the end of the input
1912 int ch = (i < len - pos) ? in[pos + i].unicode() : 0;
1913 for (j = 0; j < ncur; j++) {
1914 int cur = curStack[j];
1915 const QRegExpAutomatonState &scur = eng->s.at(cur);
1916 const QVector<int> &outs = scur.outs;
1917 for (k = 0; k < outs.size(); k++) {
1918 int next = outs.at(k);
1919 const QRegExpAutomatonState &snext = eng->s.at(next);
1920 bool inside = true;
1921#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
// extra characters a matched back-reference still has to consume
1922 int needSomeSleep = 0;
1923#endif
1924
1925 /*
1926 First, check if the anchors are anchored properly.
1927 */
1928 int a = scur.anchors.value(next);
1929 if (a != 0 && !testAnchor(i, a, curCapBegin + j * ncap))
1930 inside = false;
1931
1932 /*
1933 If indeed they are, check if the input character is
1934 correct for this transition.
1935 */
1936 if (inside) {
1937 m = snext.match;
1938 if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) {
1939 if (eng->cs)
1940 inside = (m == ch);
1941 else
1942 inside = (QChar(m).toLower() == QChar(ch).toLower());
1943 } else if (next == QRegExpEngine::FinalState) {
// record the match length; in minimal mode the first hit ends the search
1944 matchLen = i;
1945 stop = minimal;
1946 inside = true;
1947 } else if ((m & QRegExpEngine::CharClassBit) != 0) {
1948#ifndef QT_NO_REGEXP_CCLASS
1949 const QRegExpCharClass &cc = eng->cl.at(m ^ QRegExpEngine::CharClassBit);
1950 if (eng->cs)
1951 inside = cc.in(ch);
1952 else if (cc.negative())
1953 inside = cc.in(QChar(ch).toLower()) &&
1954 cc.in(QChar(ch).toUpper());
1955 else
1956 inside = cc.in(QChar(ch).toLower()) ||
1957 cc.in(QChar(ch).toUpper());
1958#endif
1959#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
1960 } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */
1961 int bref = m ^ QRegExpEngine::BackRefBit;
1962 int ell = j * ncap + eng->captureForOfficialCapture.at(bref - 1);
1963
1964 inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
// the first character of the captured text must equal the current one
1965 if (inside) {
1966 if (eng->cs)
1967 inside = (in[pos + curCapBegin[ell]] == QChar(ch));
1968 else
1969 inside = (in[pos + curCapBegin[ell]].toLower()
1970 == QChar(ch).toLower());
1971 }
1972
1973 if (inside) {
// delta is the length of the captured text (up to i if still open)
1974 int delta;
1975 if (curCapEnd[ell] == EmptyCapture)
1976 delta = i - curCapBegin[ell];
1977 else
1978 delta = curCapEnd[ell] - curCapBegin[ell];
1979
1980 inside = (delta <= len - (pos + i));
1981 if (inside && delta > 1) {
1982 int n = 1;
1983 if (eng->cs) {
1984 while (n < delta) {
1985 if (in[pos + curCapBegin[ell] + n]
1986 != in[pos + i + n])
1987 break;
1988 ++n;
1989 }
1990 } else {
1991 while (n < delta) {
1992 QChar a = in[pos + curCapBegin[ell] + n];
1993 QChar b = in[pos + i + n];
1994 if (a.toLower() != b.toLower())
1995 break;
1996 ++n;
1997 }
1998 }
1999 inside = (n == delta);
2000 if (inside)
2001 needSomeSleep = delta - 1;
2002 }
2003 }
2004#endif
2005 }
2006 }
2007
2008 /*
2009 We must now update our data structures.
2010 */
2011 if (inside) {
2012#ifndef QT_NO_REGEXP_CAPTURE
2013 int *capBegin, *capEnd;
2014#endif
2015 /*
2016 If the next state was not encountered yet, all
2017 is fine.
2018 */
2019 if ((m = inNextStack[next]) == -1) {
2020 m = nnext++;
2021 nextStack[m] = next;
2022 inNextStack[next] = m;
2023#ifndef QT_NO_REGEXP_CAPTURE
2024 capBegin = nextCapBegin + m * ncap;
2025 capEnd = nextCapEnd + m * ncap;
2026
2027 /*
2028 Otherwise, we'll first maintain captures in
2029 temporary arrays, and decide at the end whether
2030 it's best to keep the previous capture zones or
2031 the new ones.
2032 */
2033 } else {
2034 capBegin = tempCapBegin;
2035 capEnd = tempCapEnd;
2036#endif
2037 }
2038
2039#ifndef QT_NO_REGEXP_CAPTURE
2040 /*
2041 Updating the capture zones is much of a task.
2042 */
2043 if (ncap > 0) {
2044 memcpy(capBegin, curCapBegin + j * ncap, ncap * sizeof(int));
2045 memcpy(capEnd, curCapEnd + j * ncap, ncap * sizeof(int));
2046 int c = scur.atom, n = snext.atom;
2047 int p = -1, q = -1;
2048 int cap;
2049
2050 /*
2051 Lemma 1. For any x in the range [0..nf), we
2052 have f[x].parent < x.
2053
2054 Proof. By looking at startAtom(), it is
2055 clear that cf < nf holds all the time, and
2056 thus that f[nf].parent < nf.
2057 */
2058
2059 /*
2060 If we are reentering an atom, we empty all
2061 capture zones inside it.
2062 */
2063 if ((q = scur.reenter.value(next)) != 0) {
2064 QBitArray b(eng->nf, false);
2065 b.setBit(q, true);
2066 for (int ell = q + 1; ell < eng->nf; ell++) {
2067 if (b.testBit(eng->f.at(ell).parent)) {
2068 b.setBit(ell, true);
2069 cap = eng->f.at(ell).capture;
2070 if (cap >= 0) {
2071 capBegin[cap] = EmptyCapture;
2072 capEnd[cap] = EmptyCapture;
2073 }
2074 }
2075 }
2076 p = eng->f.at(q).parent;
2077
2078 /*
2079 Otherwise, close the capture zones we are
2080 leaving. We are leaving f[c].capture,
2081 f[f[c].parent].capture,
2082 f[f[f[c].parent].parent].capture, ...,
2083 until f[x].capture, with x such that
2084 f[x].parent is the youngest common ancestor
2085 for c and n.
2086
2087 We go up along c's and n's ancestry until
2088 we find x.
2089 */
2090 } else {
2091 p = c;
2092 q = n;
2093 while (p != q) {
2094 if (p > q) {
2095 cap = eng->f.at(p).capture;
2096 if (cap >= 0) {
2097 if (capBegin[cap] == i) {
2098 capBegin[cap] = EmptyCapture;
2099 capEnd[cap] = EmptyCapture;
2100 } else {
2101 capEnd[cap] = i;
2102 }
2103 }
2104 p = eng->f.at(p).parent;
2105 } else {
2106 q = eng->f.at(q).parent;
2107 }
2108 }
2109 }
2110
2111 /*
2112 In any case, we now open the capture zones
2113 we are entering. We work upwards from n
2114 until we reach p (the parent of the atom we
2115 reenter or the youngest common ancestor).
2116 */
2117 while (n > p) {
2118 cap = eng->f.at(n).capture;
2119 if (cap >= 0) {
2120 capBegin[cap] = i;
2121 capEnd[cap] = EmptyCapture;
2122 }
2123 n = eng->f.at(n).parent;
2124 }
2125 /*
2126 If the next state was already in
2127 nextStack, we must choose carefully which
2128 capture zones we want to keep.
2129 */
2130 if (capBegin == tempCapBegin &&
2131 isBetterCapture(ncap, capBegin, capEnd, nextCapBegin + m * ncap,
2132 nextCapEnd + m * ncap)) {
2133 memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2134 memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2135 }
2136 }
2137#ifndef QT_NO_REGEXP_BACKREF
2138 /*
2139 We are done with updating the capture zones.
2140 It's now time to put the next state to sleep,
2141 if it needs to, and to remove it from
2142 nextStack.
2143 */
2144 if (needSomeSleep > 0) {
// sleeper layout: [wake-up value of i, state, capBegin[0..ncap), capEnd[0..ncap)]
2145 QVector<int> zzZ(2 + 2 * ncap);
2146 zzZ[0] = i + needSomeSleep;
2147 zzZ[1] = next;
2148 if (ncap > 0) {
2149 memcpy(zzZ.data() + 2, capBegin, ncap * sizeof(int));
2150 memcpy(zzZ.data() + 2 + ncap, capEnd, ncap * sizeof(int));
2151 }
// NOTE(review): this drops the LAST entry of nextStack, which is
// `next` only if it was pushed just above — confirm for the case
// where `next` was already present.
2152 inNextStack[nextStack[--nnext]] = -1;
2153 sleeping.append(zzZ);
2154 }
2155#endif
2156#endif
2157 }
2158 }
2159 }
2160#ifndef QT_NO_REGEXP_CAPTURE
2161 /*
2162 If we reached the final state, hurray! Copy the captured
2163 zone.
2164 */
2165 if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {
2166 memcpy(capBegin, nextCapBegin + m * ncap, ncap * sizeof(int));
2167 memcpy(capEnd, nextCapEnd + m * ncap, ncap * sizeof(int));
2168 }
2169#ifndef QT_NO_REGEXP_BACKREF
2170 /*
2171 It's time to wake up the sleepers.
2172 */
2173 j = 0;
2174 while (j < sleeping.count()) {
2175 if (sleeping.at(j)[0] == i) {
2176 const QVector<int> &zzZ = sleeping.at(j);
2177 int next = zzZ[1];
2178 const int *capBegin = zzZ.data() + 2;
2179 const int *capEnd = zzZ.data() + 2 + ncap;
2180 bool copyOver = true;
2181
2182 if ((m = inNextStack[next]) == -1) {
2183 m = nnext++;
2184 nextStack[m] = next;
2185 inNextStack[next] = m;
2186 } else {
2187 copyOver = isBetterCapture(ncap, nextCapBegin + m * ncap, nextCapEnd + m * ncap,
2188 capBegin, capEnd);
2189 }
2190 if (copyOver) {
2191 memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2192 memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2193 }
2194
// zzZ refers into sleeping; it is not used after this removal
2195 sleeping.removeAt(j);
2196 } else {
2197 ++j;
2198 }
2199 }
2200#endif
2201#endif
// reset the membership table for the next round
2202 for (j = 0; j < nnext; j++)
2203 inNextStack[nextStack[j]] = -1;
2204
2205 // avoid needless iteration that confuses oneTestMatchedLen
2206 if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState
2207#ifndef QT_NO_REGEXP_BACKREF
2208 && sleeping.isEmpty()
2209#endif
2210 )
2211 stop = true;
2212
// the successors become the live states of the next iteration
2213 qSwap(curStack, nextStack);
2214#ifndef QT_NO_REGEXP_CAPTURE
2215 qSwap(curCapBegin, nextCapBegin);
2216 qSwap(curCapEnd, nextCapEnd);
2217#endif
2218 ncur = nnext;
2219 nnext = 0;
2220 ++i;
2221 }
2222
2223#ifndef QT_NO_REGEXP_BACKREF
2224 /*
2225 If minimal matching is enabled, we might have some sleepers
2226 left.
2227 */
2228 if (!sleeping.isEmpty())
2229 sleeping.clear();
2230#endif
2231
2232 oneTestMatchedLen = i - 1;
2233 return (matchLen >= 0);
2234}
2235
2236#ifndef QT_NO_REGEXP_CCLASS
2237
2238QRegExpCharClass::QRegExpCharClass()
2239 : c(0), n(false)
2240{
2241#ifndef QT_NO_REGEXP_OPTIM
2242 occ1.fill(NoOccurrence, NumBadChars);
2243#endif
2244}
2245
2246QRegExpCharClass &QRegExpCharClass::operator=(const QRegExpCharClass &cc)
2247{
2248 c = cc.c;
2249 r = cc.r;
2250 n = cc.n;
2251#ifndef QT_NO_REGEXP_OPTIM
2252 occ1 = cc.occ1;
2253#endif
2254 return *this;
2255}
2256
2257void QRegExpCharClass::clear()
2258{
2259 c = 0;
2260 r.resize(0);
2261 n = false;
2262}
2263
2264void QRegExpCharClass::setNegative(bool negative)
2265{
2266 n = negative;
2267#ifndef QT_NO_REGEXP_OPTIM
2268 occ1.fill(0, NumBadChars);
2269#endif
2270}
2271
2272void QRegExpCharClass::addCategories(int cats)
2273{
2274 c |= cats;
2275#ifndef QT_NO_REGEXP_OPTIM
2276 occ1.fill(0, NumBadChars);
2277#endif
2278}
2279
2280void QRegExpCharClass::addRange(ushort from, ushort to)
2281{
2282 if (from > to)
2283 qSwap(from, to);
2284 int m = r.size();
2285 r.resize(m + 1);
2286 r[m].from = from;
2287 r[m].len = to - from + 1;
2288
2289#ifndef QT_NO_REGEXP_OPTIM
2290 int i;
2291
2292 if (to - from < NumBadChars) {
2293 if (from % NumBadChars <= to % NumBadChars) {
2294 for (i = from % NumBadChars; i <= to % NumBadChars; i++)
2295 occ1[i] = 0;
2296 } else {
2297 for (i = 0; i <= to % NumBadChars; i++)
2298 occ1[i] = 0;
2299 for (i = from % NumBadChars; i < NumBadChars; i++)
2300 occ1[i] = 0;
2301 }
2302 } else {
2303 occ1.fill(0, NumBadChars);
2304 }
2305#endif
2306}
2307
2308bool QRegExpCharClass::in(QChar ch) const
2309{
2310#ifndef QT_NO_REGEXP_OPTIM
2311 if (occ1.at(BadChar(ch)) == NoOccurrence)
2312 return n;
2313#endif
2314
2315 if (c != 0 && (c & (1 << (int)ch.category())) != 0)
2316 return !n;
2317
2318 const int uc = ch.unicode();
2319 int size = r.size();
2320
2321 for (int i = 0; i < size; ++i) {
2322 const QRegExpCharClassRange &range = r.at(i);
2323 if (uint(uc - range.from) < uint(r.at(i).len))
2324 return !n;
2325 }
2326 return n;
2327}
2328
2329#if defined(QT_DEBUG)
2330void QRegExpCharClass::dump() const
2331{
2332 int i;
2333 qDebug(" %stive character class", n ? "nega" : "posi");
2334#ifndef QT_NO_REGEXP_CCLASS
2335 if (c != 0)
2336 qDebug(" categories 0x%.8x", c);
2337#endif
2338 for (i = 0; i < r.size(); i++)
2339 qDebug(" 0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);
2340}
2341#endif
2342#endif
2343
2344QRegExpEngine::Box::Box(QRegExpEngine *engine)
2345 : eng(engine), skipanchors(0)
2346#ifndef QT_NO_REGEXP_OPTIM
2347 , earlyStart(0), lateStart(0), maxl(0)
2348#endif
2349{
2350#ifndef QT_NO_REGEXP_OPTIM
2351 occ1.fill(NoOccurrence, NumBadChars);
2352#endif
2353 minl = 0;
2354}
2355
2356QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
2357{
2358 eng = b.eng;
2359 ls = b.ls;
2360 rs = b.rs;
2361 lanchors = b.lanchors;
2362 ranchors = b.ranchors;
2363 skipanchors = b.skipanchors;
2364#ifndef QT_NO_REGEXP_OPTIM
2365 earlyStart = b.earlyStart;
2366 lateStart = b.lateStart;
2367 str = b.str;
2368 leftStr = b.leftStr;
2369 rightStr = b.rightStr;
2370 maxl = b.maxl;
2371 occ1 = b.occ1;
2372#endif
2373 minl = b.minl;
2374 return *this;
2375}
2376
2377void QRegExpEngine::Box::set(QChar ch)
2378{
2379 ls.resize(1);
2380 ls[0] = eng->createState(ch);
2381 rs = ls;
2382#ifndef QT_NO_REGEXP_OPTIM
2383 str = ch;
2384 leftStr = ch;
2385 rightStr = ch;
2386 maxl = 1;
2387 occ1[BadChar(ch)] = 0;
2388#endif
2389 minl = 1;
2390}
2391
2392void QRegExpEngine::Box::set(const QRegExpCharClass &cc)
2393{
2394 ls.resize(1);
2395 ls[0] = eng->createState(cc);
2396 rs = ls;
2397#ifndef QT_NO_REGEXP_OPTIM
2398 maxl = 1;
2399 occ1 = cc.firstOccurrence();
2400#endif
2401 minl = 1;
2402}
2403
2404#ifndef QT_NO_REGEXP_BACKREF
2405void QRegExpEngine::Box::set(int bref)
2406{
2407 ls.resize(1);
2408 ls[0] = eng->createState(bref);
2409 rs = ls;
2410 if (bref >= 1 && bref <= MaxBackRefs)
2411 skipanchors = Anchor_BackRef0Empty << bref;
2412#ifndef QT_NO_REGEXP_OPTIM
2413 maxl = InftyLen;
2414#endif
2415 minl = 0;
2416}
2417#endif
2418
/*
  Concatenates b after this box, so that this box afterwards stands
  for "this followed by b".

  Transitions (with their anchors) are added in the engine from the
  right states of this box to the left states of b, then the left
  and right state sets, the anchor maps, the lengths, the skip
  anchors and (with optimizations enabled) the heuristic data are
  updated. The statements below depend on minl and maxl still
  holding the values of the left operand until they are updated near
  the end.
*/
2419void QRegExpEngine::Box::cat(const Box &b)
2420{
2421 eng->addCatTransitions(rs, b.ls);
2422 addAnchorsToEngine(b);
// this box may be empty: b's left states (guarded by our skip
// anchors, if any) are also left states of the result
2423 if (minl == 0) {
2424 lanchors.unite(b.lanchors);
2425 if (skipanchors != 0) {
2426 for (int i = 0; i < b.ls.size(); i++) {
2427 int a = eng->anchorConcatenation(lanchors.value(b.ls.at(i), 0), skipanchors);
2428 lanchors.insert(b.ls.at(i), a);
2429 }
2430 }
2431 mergeInto(&ls, b.ls);
2432 }
// b may be empty: our right states (guarded by b's skip anchors, if
// any) stay right states; otherwise only b's right states remain
2433 if (b.minl == 0) {
2434 ranchors.unite(b.ranchors);
2435 if (b.skipanchors != 0) {
2436 for (int i = 0; i < rs.size(); i++) {
2437 int a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), b.skipanchors);
2438 ranchors.insert(rs.at(i), a);
2439 }
2440 }
2441 mergeInto(&rs, b.rs);
2442 } else {
2443 ranchors = b.ranchors;
2444 rs = b.rs;
2445 }
2446
2447#ifndef QT_NO_REGEXP_OPTIM
// pick the longest candidate for str: the string straddling the
// junction (rightStr + b.leftStr), b.str, or the current str
2448 if (maxl != InftyLen) {
2449 if (rightStr.length() + b.leftStr.length() >
2450 qMax(str.length(), b.str.length())) {
2451 earlyStart = minl - rightStr.length();
2452 lateStart = maxl - rightStr.length();
2453 str = rightStr + b.leftStr;
2454 } else if (b.str.length() > str.length()) {
2455 earlyStart = minl + b.earlyStart;
2456 lateStart = maxl + b.lateStart;
2457 str = b.str;
2458 }
2459 }
2460
// leftStr is extended only while it covers this whole box
2461 if (leftStr.length() == maxl)
2462 leftStr += b.leftStr;
2463
// likewise rightStr is extended only if b.rightStr covers all of b
2464 if (b.rightStr.length() == b.maxl) {
2465 rightStr += b.rightStr;
2466 } else {
2467 rightStr = b.rightStr;
2468 }
2469
2470 if (maxl == InftyLen || b.maxl == InftyLen) {
2471 maxl = InftyLen;
2472 } else {
2473 maxl += b.maxl;
2474 }
2475
// a character of b can first occur minl positions later than in b alone
2476 for (int i = 0; i < NumBadChars; i++) {
2477 if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))
2478 occ1[i] = minl + b.occ1.at(i);
2479 }
2480#endif
2481
2482 minl += b.minl;
// skip anchors are kept only while the combined box can still be empty
2483 if (minl == 0)
2484 skipanchors = eng->anchorConcatenation(skipanchors, b.skipanchors);
2485 else
2486 skipanchors = 0;
2487}
2488
2489void QRegExpEngine::Box::orx(const Box &b)
2490{
2491 mergeInto(&ls, b.ls);
2492 lanchors.unite(b.lanchors);
2493 mergeInto(&rs, b.rs);
2494 ranchors.unite(b.ranchors);
2495
2496 if (b.minl == 0) {
2497 if (minl == 0)
2498 skipanchors = eng->anchorAlternation(skipanchors, b.skipanchors);
2499 else
2500 skipanchors = b.skipanchors;
2501 }
2502
2503#ifndef QT_NO_REGEXP_OPTIM
2504 for (int i = 0; i < NumBadChars; i++) {
2505 if (occ1.at(i) > b.occ1.at(i))
2506 occ1[i] = b.occ1.at(i);
2507 }
2508 earlyStart = 0;
2509 lateStart = 0;
2510 str = QString();
2511 leftStr = QString();
2512 rightStr = QString();
2513 if (b.maxl > maxl)
2514 maxl = b.maxl;
2515#endif
2516 if (b.minl < minl)
2517 minl = b.minl;
2518}
2519
2520void QRegExpEngine::Box::plus(int atom)
2521{
2522#ifndef QT_NO_REGEXP_CAPTURE
2523 eng->addPlusTransitions(rs, ls, atom);
2524#else
2525 Q_UNUSED(atom);
2526 eng->addCatTransitions(rs, ls);
2527#endif
2528 addAnchorsToEngine(*this);
2529#ifndef QT_NO_REGEXP_OPTIM
2530 maxl = InftyLen;
2531#endif
2532}
2533
2534void QRegExpEngine::Box::opt()
2535{
2536#ifndef QT_NO_REGEXP_OPTIM
2537 earlyStart = 0;
2538 lateStart = 0;
2539 str = QString();
2540 leftStr = QString();
2541 rightStr = QString();
2542#endif
2543 skipanchors = 0;
2544 minl = 0;
2545}
2546
2547void QRegExpEngine::Box::catAnchor(int a)
2548{
2549 if (a != 0) {
2550 for (int i = 0; i < rs.size(); i++) {
2551 a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), a);
2552 ranchors.insert(rs.at(i), a);
2553 }
2554 if (minl == 0)
2555 skipanchors = eng->anchorConcatenation(skipanchors, a);
2556 }
2557}
2558
2559#ifndef QT_NO_REGEXP_OPTIM
2560void QRegExpEngine::Box::setupHeuristics()
2561{
2562 eng->goodEarlyStart = earlyStart;
2563 eng->goodLateStart = lateStart;
2564 eng->goodStr = eng->cs ? str : str.toLower();
2565
2566 eng->minl = minl;
2567 if (eng->cs) {
2568 /*
2569 A regular expression such as 112|1 has occ1['2'] = 2 and minl =
2570 1 at this point. An entry of occ1 has to be at most minl or
2571 infinity for the rest of the algorithm to go well.
2572
2573 We waited until here before normalizing these cases (instead of
2574 doing it in Box::orx()) because sometimes things improve by
2575 themselves. Consider for example (112|1)34.
2576 */
2577 for (int i = 0; i < NumBadChars; i++) {
2578 if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)
2579 occ1[i] = minl;
2580 }
2581 eng->occ1 = occ1;
2582 } else {
2583 eng->occ1.fill(0, NumBadChars);
2584 }
2585
2586 eng->heuristicallyChooseHeuristic();
2587}
2588#endif
2589
2590#if defined(QT_DEBUG)
2591void QRegExpEngine::Box::dump() const
2592{
2593 int i;
2594 qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s");
2595 qDebug(" Left states:");
2596 for (i = 0; i < ls.size(); i++) {
2597 if (lanchors.value(ls[i], 0) == 0)
2598 qDebug(" %d", ls[i]);
2599 else
2600 qDebug(" %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);
2601 }
2602 qDebug(" Right states:");
2603 for (i = 0; i < rs.size(); i++) {
2604 if (ranchors.value(rs[i], 0) == 0)
2605 qDebug(" %d", rs[i]);
2606 else
2607 qDebug(" %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);
2608 }
2609 qDebug(" Skip anchors: 0x%.8x", skipanchors);
2610}
2611#endif
2612
2613void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const
2614{
2615 for (int i = 0; i < to.ls.size(); i++) {
2616 for (int j = 0; j < rs.size(); j++) {
2617 int a = eng->anchorConcatenation(ranchors.value(rs.at(j), 0),
2618 to.lanchors.value(to.ls.at(i), 0));
2619 eng->addAnchors(rs[j], to.ls[i], a);
2620 }
2621 }
2622}
2623
2624int QRegExpEngine::getChar()
2625{
2626 return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();
2627}
2628
/*
  Tokenizes an escape sequence. On entry yyCh holds the character
  that follows the backslash; on exit yyCh holds the first character
  after the whole sequence.

  Returns
    - Tok_Char | code for \a \f \n \r \t \v, octal \0ooo, hexadecimal
      \xhhhh and any character without a special meaning;
    - Tok_CharClass for \d \D \s \S \w \W, after adding the matching
      categories and ranges to *yyCharClass;
    - Tok_Word and Tok_NonWord for \b and \B;
    - Tok_BackRef | n for \1, \2, ...

  Errors are recorded through error(): RXERR_END when the pattern
  ends right after the backslash, RXERR_OCTAL when an octal escape
  exceeds 0377, and RXERR_DISABLED for a back-reference when
  back-references are compiled out.
*/
2629int QRegExpEngine::getEscape()
2630{
2631#ifndef QT_NO_REGEXP_ESCAPE
2632 const char tab[] = "afnrtv"; // no b, as \b means word boundary
2633 const char backTab[] = "\a\f\n\r\t\v";
2634 ushort low;
2635 int i;
2636#endif
2637 ushort val;
2638 int prevCh = yyCh;
2639
2640 if (prevCh == EOS) {
2641 error(RXERR_END);
2642 return Tok_Char | '\\';
2643 }
2644 yyCh = getChar();
2645#ifndef QT_NO_REGEXP_ESCAPE
// single-letter C escapes: tab[k] maps to backTab[k]; only codes
// below 0x100 are looked up since strchr() works on chars
2646 if ((prevCh & ~0xff) == 0) {
2647 const char *p = strchr(tab, prevCh);
2648 if (p != 0)
2649 return Tok_Char | backTab[p - tab];
2650 }
2651#endif
2652
2653 switch (prevCh) {
2654#ifndef QT_NO_REGEXP_ESCAPE
2655 case '0':
// up to three octal digits
2656 val = 0;
2657 for (i = 0; i < 3; i++) {
2658 if (yyCh >= '0' && yyCh <= '7')
2659 val = (val << 3) | (yyCh - '0');
2660 else
2661 break;
2662 yyCh = getChar();
2663 }
2664 if ((val & ~0377) != 0)
2665 error(RXERR_OCTAL);
2666 return Tok_Char | val;
2667#endif
2668#ifndef QT_NO_REGEXP_ESCAPE
2669 case 'B':
2670 return Tok_NonWord;
2671#endif
2672#ifndef QT_NO_REGEXP_CCLASS
2673 case 'D':
2674 // see QChar::isDigit()
2675 yyCharClass->addCategories(0x7fffffef);
2676 return Tok_CharClass;
2677 case 'S':
2678 // see QChar::isSpace()
2679 yyCharClass->addCategories(0x7ffff87f);
2680 yyCharClass->addRange(0x0000, 0x0008);
2681 yyCharClass->addRange(0x000e, 0x001f);
2682 yyCharClass->addRange(0x007f, 0x009f);
2683 return Tok_CharClass;
2684 case 'W':
2685 // see QChar::isLetterOrNumber() and QChar::isMark()
2686 yyCharClass->addCategories(0x7fe07f81);
2687 yyCharClass->addRange(0x203f, 0x2040);
// NOTE(review): 0x2040 is already covered by the range just above,
// so the next call looks redundant
2688 yyCharClass->addSingleton(0x2040);
2689 yyCharClass->addSingleton(0x2054);
2690 yyCharClass->addSingleton(0x30fb);
2691 yyCharClass->addRange(0xfe33, 0xfe34);
2692 yyCharClass->addRange(0xfe4d, 0xfe4f);
2693 yyCharClass->addSingleton(0xff3f);
2694 yyCharClass->addSingleton(0xff65);
2695 return Tok_CharClass;
2696#endif
2697#ifndef QT_NO_REGEXP_ESCAPE
2698 case 'b':
2699 return Tok_Word;
2700#endif
2701#ifndef QT_NO_REGEXP_CCLASS
2702 case 'd':
2703 // see QChar::isDigit()
2704 yyCharClass->addCategories(0x00000010);
2705 return Tok_CharClass;
2706 case 's':
2707 // see QChar::isSpace()
2708 yyCharClass->addCategories(0x00000380);
2709 yyCharClass->addRange(0x0009, 0x000d);
2710 return Tok_CharClass;
2711 case 'w':
2712 // see QChar::isLetterOrNumber() and QChar::isMark()
2713 yyCharClass->addCategories(0x000f807e);
2714 yyCharClass->addSingleton(0x005f); // '_'
2715 return Tok_CharClass;
2716#endif
2717#ifndef QT_NO_REGEXP_ESCAPE
2718 case 'x':
// up to four hexadecimal digits, in either case
2719 val = 0;
2720 for (i = 0; i < 4; i++) {
2721 low = QChar(yyCh).toLower().unicode();
2722 if (low >= '0' && low <= '9')
2723 val = (val << 4) | (low - '0');
2724 else if (low >= 'a' && low <= 'f')
2725 val = (val << 4) | (low - 'a' + 10);
2726 else
2727 break;
2728 yyCh = getChar();
2729 }
2730 return Tok_Char | val;
2731#endif
2732 default:
2733 if (prevCh >= '1' && prevCh <= '9') {
2734#ifndef QT_NO_REGEXP_BACKREF
// NOTE(review): val is a ushort, so a very long digit string wraps
// around instead of being rejected — confirm whether that matters
2735 val = prevCh - '0';
2736 while (yyCh >= '0' && yyCh <= '9') {
2737 val = (val * 10) + (yyCh - '0');
2738 yyCh = getChar();
2739 }
2740 return Tok_BackRef | val;
2741#else
2742 error(RXERR_DISABLED);
2743#endif
2744 }
// any other escaped character stands for itself
2745 return Tok_Char | prevCh;
2746 }
2747}
2748
2749#ifndef QT_NO_REGEXP_INTERVAL
2750int QRegExpEngine::getRep(int def)
2751{
2752 if (yyCh >= '0' && yyCh <= '9') {
2753 int rep = 0;
2754 do {
2755 rep = 10 * rep + yyCh - '0';
2756 if (rep >= InftyRep) {
2757 error(RXERR_REPETITION);
2758 rep = def;
2759 }
2760 yyCh = getChar();
2761 } while (yyCh >= '0' && yyCh <= '9');
2762 return rep;
2763 } else {
2764 return def;
2765 }
2766}
2767#endif
2768
2769#ifndef QT_NO_REGEXP_LOOKAHEAD
2770void QRegExpEngine::skipChars(int n)
2771{
2772 if (n > 0) {
2773 yyPos += n - 1;
2774 yyCh = getChar();
2775 }
2776}
2777#endif
2778
2779void QRegExpEngine::error(const char *msg)
2780{
2781 if (yyError.isEmpty())
2782 yyError = QLatin1String(msg);
2783}
2784
2785void QRegExpEngine::startTokenizer(const QChar *rx, int len)
2786{
2787 yyIn = rx;
2788 yyPos0 = 0;
2789 yyPos = 0;
2790 yyLen = len;
2791 yyCh = getChar();
2792 yyCharClass = new QRegExpCharClass;
2793 yyMinRep = 0;
2794 yyMaxRep = 0;
2795 yyError = QString();
2796}
2797
2798int QRegExpEngine::getToken()
2799{
2800#ifndef QT_NO_REGEXP_CCLASS
2801 ushort pendingCh = 0;
2802 bool charPending;
2803 bool rangePending;
2804 int tok;
2805#endif
2806 int prevCh = yyCh;
2807
2808 yyPos0 = yyPos - 1;
2809#ifndef QT_NO_REGEXP_CCLASS
2810 yyCharClass->clear();
2811#endif
2812 yyMinRep = 0;
2813 yyMaxRep = 0;
2814 yyCh = getChar();
2815
2816 switch (prevCh) {
2817 case EOS:
2818 yyPos0 = yyPos;
2819 return Tok_Eos;
2820 case '$':
2821 return Tok_Dollar;
2822 case '(':
2823 if (yyCh == '?') {
2824 prevCh = getChar();
2825 yyCh = getChar();
2826 switch (prevCh) {
2827#ifndef QT_NO_REGEXP_LOOKAHEAD
2828 case '!':
2829 return Tok_NegLookahead;
2830 case '=':
2831 return Tok_PosLookahead;
2832#endif
2833 case ':':
2834 return Tok_MagicLeftParen;
2835 default:
2836 error(RXERR_LOOKAHEAD);
2837 return Tok_MagicLeftParen;
2838 }
2839 } else {
2840 return Tok_LeftParen;
2841 }
2842 case ')':
2843 return Tok_RightParen;
2844 case '*':
2845 yyMinRep = 0;
2846 yyMaxRep = InftyRep;
2847 return Tok_Quantifier;
2848 case '+':
2849 yyMinRep = 1;
2850 yyMaxRep = InftyRep;
2851 return Tok_Quantifier;
2852 case '.':
2853#ifndef QT_NO_REGEXP_CCLASS
2854 yyCharClass->setNegative(true);
2855#endif
2856 return Tok_CharClass;
2857 case '?':
2858 yyMinRep = 0;
2859 yyMaxRep = 1;
2860 return Tok_Quantifier;
2861 case '[':
2862#ifndef QT_NO_REGEXP_CCLASS
2863 if (yyCh == '^') {
2864 yyCharClass->setNegative(true);
2865 yyCh = getChar();
2866 }
2867 charPending = false;
2868 rangePending = false;
2869 do {
2870 if (yyCh == '-' && charPending && !rangePending) {
2871 rangePending = true;
2872 yyCh = getChar();
2873 } else {
2874 if (charPending && !rangePending) {
2875 yyCharClass->addSingleton(pendingCh);
2876 charPending = false;
2877 }
2878 if (yyCh == '\\') {
2879 yyCh = getChar();
2880 tok = getEscape();
2881 if (tok == Tok_Word)
2882 tok = '\b';
2883 } else {
2884 tok = Tok_Char | yyCh;
2885 yyCh = getChar();
2886 }
2887 if (tok == Tok_CharClass) {
2888 if (rangePending) {
2889 yyCharClass->addSingleton('-');
2890 yyCharClass->addSingleton(pendingCh);
2891 charPending = false;
2892 rangePending = false;
2893 }
2894 } else if ((tok & Tok_Char) != 0) {
2895 if (rangePending) {
2896 yyCharClass->addRange(pendingCh, tok ^ Tok_Char);
2897 charPending = false;
2898 rangePending = false;
2899 } else {
2900 pendingCh = tok ^ Tok_Char;
2901 charPending = true;
2902 }
2903 } else {
2904 error(RXERR_CHARCLASS);
2905 }
2906 }
2907 } while (yyCh != ']' && yyCh != EOS);
2908 if (rangePending)
2909 yyCharClass->addSingleton('-');
2910 if (charPending)
2911 yyCharClass->addSingleton(pendingCh);
2912 if (yyCh == EOS)
2913 error(RXERR_END);
2914 else
2915 yyCh = getChar();
2916 return Tok_CharClass;
2917#else
2918 error(RXERR_END);
2919 return Tok_Char | '[';
2920#endif
2921 case '\\':
2922 return getEscape();
2923 case ']':
2924 error(RXERR_LEFTDELIM);
2925 return Tok_Char | ']';
2926 case '^':
2927 return Tok_Caret;
2928 case '{':
2929#ifndef QT_NO_REGEXP_INTERVAL
2930 yyMinRep = getRep(0);
2931 yyMaxRep = yyMinRep;
2932 if (yyCh == ',') {
2933 yyCh = getChar();
2934 yyMaxRep = getRep(InftyRep);
2935 }
2936 if (yyMaxRep < yyMinRep)
2937 qSwap(yyMinRep, yyMaxRep);
2938 if (yyCh != '}')
2939 error(RXERR_REPETITION);
2940 yyCh = getChar();
2941 return Tok_Quantifier;
2942#else
2943 error(RXERR_DISABLED);
2944 return Tok_Char | '{';
2945#endif
2946 case '|':
2947 return Tok_Bar;
2948 case '}':
2949 error(RXERR_LEFTDELIM);
2950 return Tok_Char | '}';
2951 default:
2952 return Tok_Char | prevCh;
2953 }
2954}
2955
/*
  Compiles the pattern of length len into this engine.

  The expression is parsed into a box that is placed between two
  boxes built from an empty character class; the states created for
  those two boxes serve as InitialState and FinalState, so the order
  in which the boxes are set up matters. Afterwards the capture
  numbers of the atoms are finalized, the caretAnchored flag is
  computed (with optimizations enabled) and anchor entries equal to
  0 are removed from every state.

  Returns -1 if an error was recorded in yyError, otherwise yyPos0
  as left by the tokenizer.
*/
2956int QRegExpEngine::parse(const QChar *pattern, int len)
2957{
2958 valid = true;
2959 startTokenizer(pattern, len);
2960 yyTok = getToken();
2961#ifndef QT_NO_REGEXP_CAPTURE
2962 yyMayCapture = true;
2963#else
2964 yyMayCapture = false;
2965#endif
2966
2967#ifndef QT_NO_REGEXP_CAPTURE
2968 int atom = startAtom(false);
2969#endif
2970 QRegExpCharClass anything;
2971 Box box(this); // create InitialState
2972 box.set(anything);
2973 Box rightBox(this); // create FinalState
2974 rightBox.set(anything);
2975
2976 Box middleBox(this);
2977 parseExpression(&middleBox);
2978#ifndef QT_NO_REGEXP_CAPTURE
2979 finishAtom(atom, false);
2980#endif
2981#ifndef QT_NO_REGEXP_OPTIM
// the heuristics are taken from the expression alone, before it is
// wrapped between the initial and final boxes
2982 middleBox.setupHeuristics();
2983#endif
2984 box.cat(middleBox);
2985 box.cat(rightBox);
// the scratch class allocated by startTokenizer() is no longer needed
2986 delete yyCharClass;
2987 yyCharClass = 0;
2988
2989#ifndef QT_NO_REGEXP_CAPTURE
// replace the capture markers of the atoms by final capture indices;
// unofficial captures get an index only with greedy quantifiers
2990 for (int i = 0; i < nf; ++i) {
2991 switch (f[i].capture) {
2992 case QRegExpAtom::NoCapture:
2993 break;
2994 case QRegExpAtom::OfficialCapture:
2995 f[i].capture = ncap;
2996 captureForOfficialCapture.append(ncap);
2997 ++ncap;
2998 ++officialncap;
2999 break;
3000 case QRegExpAtom::UnofficialCapture:
3001 f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;
3002 }
3003 }
3004
3005#ifndef QT_NO_REGEXP_BACKREF
3006#ifndef QT_NO_REGEXP_OPTIM
// no official capture and no back-reference: drop all atom data
3007 if (officialncap == 0 && nbrefs == 0) {
3008 ncap = nf = 0;
3009 f.clear();
3010 }
3011#endif
3012 // handle the case where there's a \5 with no corresponding capture
3013 // (captureForOfficialCapture.size() != officialncap)
3014 for (int i = 0; i < nbrefs - officialncap; ++i) {
3015 captureForOfficialCapture.append(ncap);
3016 ++ncap;
3017 }
3018#endif
3019#endif
3020
3021 if (!yyError.isEmpty())
3022 return -1;
3023
3024#ifndef QT_NO_REGEXP_OPTIM
// caretAnchored holds only if InitialState has anchor entries and
// each of them includes Anchor_Caret without being an alternation
3025 const QRegExpAutomatonState &sinit = s.at(InitialState);
3026 caretAnchored = !sinit.anchors.isEmpty();
3027 if (caretAnchored) {
3028 const QMap<int, int> &anchors = sinit.anchors;
3029 QMap<int, int>::const_iterator a;
3030 for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {
3031 if (
3032#ifndef QT_NO_REGEXP_ANCHOR_ALT
3033 (*a & Anchor_Alternation) != 0 ||
3034#endif
3035 (*a & Anchor_Caret) == 0)
3036 {
3037 caretAnchored = false;
3038 break;
3039 }
3040 }
3041 }
3042#endif
3043
3044 // cleanup anchors
3045 int numStates = s.count();
3046 for (int i = 0; i < numStates; ++i) {
3047 QRegExpAutomatonState &state = s[i];
3048 if (!state.anchors.isEmpty()) {
3049 QMap<int, int>::iterator a = state.anchors.begin();
3050 while (a != state.anchors.end()) {
3051 if (a.value() == 0)
3052 a = state.anchors.erase(a);
3053 else
3054 ++a;
3055 }
3056 }
3057 }
3058
3059 return yyPos0;
3060}
3061
3062void QRegExpEngine::parseAtom(Box *box)
3063{
3064#ifndef QT_NO_REGEXP_LOOKAHEAD
3065 QRegExpEngine *eng = 0;
3066 bool neg;
3067 int len;
3068#endif
3069
3070 if ((yyTok & Tok_Char) != 0) {
3071 box->set(QChar(yyTok ^ Tok_Char));
3072 } else {
3073#ifndef QT_NO_REGEXP_OPTIM
3074 trivial = false;
3075#endif
3076 switch (yyTok) {
3077 case Tok_Dollar:
3078 box->catAnchor(Anchor_Dollar);
3079 break;
3080 case Tok_Caret:
3081 box->catAnchor(Anchor_Caret);
3082 break;
3083#ifndef QT_NO_REGEXP_LOOKAHEAD
3084 case Tok_PosLookahead:
3085 case Tok_NegLookahead:
3086 neg = (yyTok == Tok_NegLookahead);
3087 eng = new QRegExpEngine(cs, greedyQuantifiers);
3088 len = eng->parse(yyIn + yyPos - 1, yyLen - yyPos + 1);
3089 if (len >= 0)
3090 skipChars(len);
3091 else
3092 error(RXERR_LOOKAHEAD);
3093 box->catAnchor(addLookahead(eng, neg));
3094 yyTok = getToken();
3095 if (yyTok != Tok_RightParen)
3096 error(RXERR_LOOKAHEAD);
3097 break;
3098#endif
3099#ifndef QT_NO_REGEXP_ESCAPE
3100 case Tok_Word:
3101 box->catAnchor(Anchor_Word);
3102 break;
3103 case Tok_NonWord:
3104 box->catAnchor(Anchor_NonWord);
3105 break;
3106#endif
3107 case Tok_LeftParen:
3108 case Tok_MagicLeftParen:
3109 yyTok = getToken();
3110 parseExpression(box);
3111 if (yyTok != Tok_RightParen)
3112 error(RXERR_END);
3113 break;
3114 case Tok_CharClass:
3115 box->set(*yyCharClass);
3116 break;
3117 case Tok_Quantifier:
3118 error(RXERR_REPETITION);
3119 break;
3120 default:
3121#ifndef QT_NO_REGEXP_BACKREF
3122 if ((yyTok & Tok_BackRef) != 0)
3123 box->set(yyTok ^ Tok_BackRef);
3124 else
3125#endif
3126 error(RXERR_DISABLED);
3127 }
3128 }
3129 yyTok = getToken();
3130}
3131
3132void QRegExpEngine::parseFactor(Box *box)
3133{
3134#ifndef QT_NO_REGEXP_CAPTURE
3135 int outerAtom = greedyQuantifiers ? startAtom(false) : -1;
3136 int innerAtom = startAtom(yyMayCapture && yyTok == Tok_LeftParen);
3137 bool magicLeftParen = (yyTok == Tok_MagicLeftParen);
3138#else
3139 const int innerAtom = -1;
3140#endif
3141
3142#ifndef QT_NO_REGEXP_INTERVAL
3143#define YYREDO() \
3144 yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3145 *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3146
3147 const QChar *in = yyIn;
3148 int pos0 = yyPos0;
3149 int pos = yyPos;
3150 int len = yyLen;
3151 int ch = yyCh;
3152 QRegExpCharClass charClass;
3153 if (yyTok == Tok_CharClass)
3154 charClass = *yyCharClass;
3155 int tok = yyTok;
3156 bool mayCapture = yyMayCapture;
3157#endif
3158
3159 parseAtom(box);
3160#ifndef QT_NO_REGEXP_CAPTURE
3161 finishAtom(innerAtom, magicLeftParen);
3162#endif
3163
3164 bool hasQuantifier = (yyTok == Tok_Quantifier);
3165 if (hasQuantifier) {
3166#ifndef QT_NO_REGEXP_OPTIM
3167 trivial = false;
3168#endif
3169 if (yyMaxRep == InftyRep) {
3170 box->plus(innerAtom);
3171#ifndef QT_NO_REGEXP_INTERVAL
3172 } else if (yyMaxRep == 0) {
3173 box->clear();
3174#endif
3175 }
3176 if (yyMinRep == 0)
3177 box->opt();
3178
3179#ifndef QT_NO_REGEXP_INTERVAL
3180 yyMayCapture = false;
3181 int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1;
3182 int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1);
3183
3184 Box rightBox(this);
3185 int i;
3186
3187 for (i = 0; i < beta; i++) {
3188 YYREDO();
3189 Box leftBox(this);
3190 parseAtom(&leftBox);
3191 leftBox.cat(rightBox);
3192 leftBox.opt();
3193 rightBox = leftBox;
3194 }
3195 for (i = 0; i < alpha; i++) {
3196 YYREDO();
3197 Box leftBox(this);
3198 parseAtom(&leftBox);
3199 leftBox.cat(rightBox);
3200 rightBox = leftBox;
3201 }
3202 rightBox.cat(*box);
3203 *box = rightBox;
3204#endif
3205 yyTok = getToken();
3206#ifndef QT_NO_REGEXP_INTERVAL
3207 yyMayCapture = mayCapture;
3208#endif
3209 }
3210#undef YYREDO
3211#ifndef QT_NO_REGEXP_CAPTURE
3212 if (greedyQuantifiers)
3213 finishAtom(outerAtom, hasQuantifier);
3214#endif
3215}
3216
3217void QRegExpEngine::parseTerm(Box *box)
3218{
3219#ifndef QT_NO_REGEXP_OPTIM
3220 if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)
3221 parseFactor(box);
3222#endif
3223 while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {
3224 Box rightBox(this);
3225 parseFactor(&rightBox);
3226 box->cat(rightBox);
3227 }
3228}
3229
3230void QRegExpEngine::parseExpression(Box *box)
3231{
3232 parseTerm(box);
3233 while (yyTok == Tok_Bar) {
3234#ifndef QT_NO_REGEXP_OPTIM
3235 trivial = false;
3236#endif
3237 Box rightBox(this);
3238 yyTok = getToken();
3239 parseTerm(&rightBox);
3240 box->orx(rightBox);
3241 }
3242}
3243
3244/*
3245 The struct QRegExpPrivate contains the private data of a regular
3246 expression other than the automaton. It makes it possible for many
3247 QRegExp objects to use the same QRegExpEngine object with different
3248 QRegExpPrivate objects.
3249*/
3250struct QRegExpPrivate
3251{
3252 QRegExpEngine *eng;
3253 QRegExpEngineKey engineKey;
3254 bool minimal;
3255#ifndef QT_NO_REGEXP_CAPTURE
3256 QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()
3257 QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3258#endif
3259 QRegExpMatchState matchState;
3260
3261 inline QRegExpPrivate()
3262 : eng(0), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }
3263 inline QRegExpPrivate(const QRegExpEngineKey &key)
3264 : eng(0), engineKey(key), minimal(false) {}
3265};
3266
3267#if !defined(QT_NO_REGEXP_OPTIM)
3268uint qHash(const QRegExpEngineKey &key)
3269{
3270 return qHash(key.pattern);
3271}
3272
3273typedef QCache<QRegExpEngineKey, QRegExpEngine> EngineCache;
3274Q_GLOBAL_STATIC(EngineCache, globalEngineCache)
3275Q_GLOBAL_STATIC(QMutex, mutex)
3276#endif // QT_NO_REGEXP_OPTIM
3277
3278static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key)
3279{
3280 if (!eng->ref.deref()) {
3281#if !defined(QT_NO_REGEXP_OPTIM)
3282 if (globalEngineCache()) {
3283 QMutexLocker locker(mutex());
3284 globalEngineCache()->insert(key, eng, 4 + key.pattern.length() / 4);
3285 }
3286 else
3287 delete eng;
3288#else
3289 Q_UNUSED(key);
3290 delete eng;
3291#endif
3292 }
3293}
3294
3295static void prepareEngine_helper(QRegExpPrivate *priv)
3296{
3297 bool initMatchState = !priv->eng;
3298#if !defined(QT_NO_REGEXP_OPTIM)
3299 if (!priv->eng) {
3300 QMutexLocker locker(mutex());
3301 priv->eng = globalEngineCache()->take(priv->engineKey);
3302 if (priv->eng != 0)
3303 priv->eng->ref.ref();
3304 }
3305#endif // QT_NO_REGEXP_OPTIM
3306
3307 if (!priv->eng)
3308 priv->eng = new QRegExpEngine(priv->engineKey);
3309
3310 if (initMatchState)
3311 priv->matchState.prepareForMatch(priv->eng);
3312}
3313
3314inline static void prepareEngine(QRegExpPrivate *priv)
3315{
3316 if (priv->eng)
3317 return;
3318 prepareEngine_helper(priv);
3319}
3320
3321static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str)
3322{
3323 prepareEngine(priv);
3324 priv->matchState.prepareForMatch(priv->eng);
3325#ifndef QT_NO_REGEXP_CAPTURE
3326 priv->t = str;
3327 priv->capturedCache.clear();
3328#else
3329 Q_UNUSED(str);
3330#endif
3331}
3332
3333static void invalidateEngine(QRegExpPrivate *priv)
3334{
3335 if (priv->eng != 0) {
3336 derefEngine(priv->eng, priv->engineKey);
3337 priv->eng = 0;
3338 priv->matchState.drain();
3339 }
3340}
3341
3342/*!
3343 \enum QRegExp::CaretMode
3344
3345 The CaretMode enum defines the different meanings of the caret
3346 (\bold{^}) in a regular expression. The possible values are:
3347
3348 \value CaretAtZero
3349 The caret corresponds to index 0 in the searched string.
3350
3351 \value CaretAtOffset
3352 The caret corresponds to the start offset of the search.
3353
3354 \value CaretWontMatch
3355 The caret never matches.
3356*/
3357
3358/*!
3359 \enum QRegExp::PatternSyntax
3360
3361 The syntax used to interpret the meaning of the pattern.
3362
3363 \value RegExp A rich Perl-like pattern matching syntax. This is
3364 the default.
3365
3366 \value RegExp2 Like RegExp, but with \l{greedy quantifiers}. This
3367 will be the default in Qt 5. (Introduced in Qt 4.2.)
3368
3369 \value Wildcard This provides a simple pattern matching syntax
3370 similar to that used by shells (command interpreters) for "file
3371 globbing". See \l{Wildcard Matching}.
3372
3373 \value FixedString The pattern is a fixed string. This is
3374 equivalent to using the RegExp pattern on a string in
3375 which all metacharacters are escaped using escape().
3376
3377 \sa setPatternSyntax()
3378*/
3379
3380/*!
3381 Constructs an empty regexp.
3382
3383 \sa isValid(), errorString()
3384*/
3385QRegExp::QRegExp()
3386{
3387 priv = new QRegExpPrivate;
3388}
3389
3390/*!
3391 Constructs a regular expression object for the given \a pattern
3392 string. The pattern must be given using wildcard notation if \a
3393 syntax is \l Wildcard; the default is \l RegExp. The pattern is
3394 case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
3395 greedy (maximal), but can be changed by calling
3396 setMinimal().
3397
3398 \sa setPattern(), setCaseSensitivity(), setPatternSyntax()
3399*/
3400QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
3401{
3402 priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs));
3403}
3404
3405/*!
3406 Constructs a regular expression as a copy of \a rx.
3407
3408 \sa operator=()
3409*/
3410QRegExp::QRegExp(const QRegExp &rx)
3411{
3412 priv = new QRegExpPrivate;
3413 operator=(rx);
3414}
3415
3416/*!
3417 Destroys the regular expression and cleans up its internal data.
3418*/
3419QRegExp::~QRegExp()
3420{
3421 invalidateEngine(priv);
3422 delete priv;
3423}
3424
3425/*!
3426 Copies the regular expression \a rx and returns a reference to the
3427 copy. The case sensitivity, wildcard, and minimal matching options
3428 are also copied.
3429*/
3430QRegExp &QRegExp::operator=(const QRegExp &rx)
3431{
3432 prepareEngine(rx.priv); // to allow sharing
3433 QRegExpEngine *otherEng = rx.priv->eng;
3434 if (otherEng)
3435 otherEng->ref.ref();
3436 invalidateEngine(priv);
3437 priv->eng = otherEng;
3438 priv->engineKey = rx.priv->engineKey;
3439 priv->minimal = rx.priv->minimal;
3440#ifndef QT_NO_REGEXP_CAPTURE
3441 priv->t = rx.priv->t;
3442 priv->capturedCache = rx.priv->capturedCache;
3443#endif
3444 if (priv->eng)
3445 priv->matchState.prepareForMatch(priv->eng);
3446 priv->matchState.captured = rx.priv->matchState.captured;
3447 return *this;
3448}
3449
3450/*!
3451 Returns true if this regular expression is equal to \a rx;
3452 otherwise returns false.
3453
3454 Two QRegExp objects are equal if they have the same pattern
3455 strings and the same settings for case sensitivity, wildcard and
3456 minimal matching.
3457*/
3458bool QRegExp::operator==(const QRegExp &rx) const
3459{
3460 return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;
3461}
3462
3463/*!
3464 \fn bool QRegExp::operator!=(const QRegExp &rx) const
3465
3466 Returns true if this regular expression is not equal to \a rx;
3467 otherwise returns false.
3468
3469 \sa operator==()
3470*/
3471
3472/*!
3473 Returns true if the pattern string is empty; otherwise returns
3474 false.
3475
3476 If you call exactMatch() with an empty pattern on an empty string
3477 it will return true; otherwise it returns false since it operates
3478 over the whole string. If you call indexIn() with an empty pattern
3479 on \e any string it will return the start offset (0 by default)
3480 because the empty pattern matches the 'emptiness' at the start of
3481 the string. In this case the length of the match returned by
3482 matchedLength() will be 0.
3483
3484 See QString::isEmpty().
3485*/
3486
3487bool QRegExp::isEmpty() const
3488{
3489 return priv->engineKey.pattern.isEmpty();
3490}
3491
3492/*!
3493 Returns true if the regular expression is valid; otherwise returns
3494 false. An invalid regular expression never matches.
3495
3496 The pattern \bold{[a-z} is an example of an invalid pattern, since
3497 it lacks a closing square bracket.
3498
3499 Note that the validity of a regexp may also depend on the setting
3500 of the wildcard flag, for example \bold{*.html} is a valid
3501 wildcard regexp but an invalid full regexp.
3502
3503 \sa errorString()
3504*/
3505bool QRegExp::isValid() const
3506{
3507 if (priv->engineKey.pattern.isEmpty()) {
3508 return true;
3509 } else {
3510 prepareEngine(priv);
3511 return priv->eng->isValid();
3512 }
3513}
3514
3515/*!
3516 Returns the pattern string of the regular expression. The pattern
3517 has either regular expression syntax or wildcard syntax, depending
3518 on patternSyntax().
3519
3520 \sa patternSyntax(), caseSensitivity()
3521*/
3522QString QRegExp::pattern() const
3523{
3524 return priv->engineKey.pattern;
3525}
3526
3527/*!
3528 Sets the pattern string to \a pattern. The case sensitivity,
3529 wildcard, and minimal matching options are not changed.
3530
3531 \sa setPatternSyntax(), setCaseSensitivity()
3532*/
3533void QRegExp::setPattern(const QString &pattern)
3534{
3535 if (priv->engineKey.pattern != pattern) {
3536 invalidateEngine(priv);
3537 priv->engineKey.pattern = pattern;
3538 }
3539}
3540
3541/*!
3542 Returns Qt::CaseSensitive if the regexp is matched case
3543 sensitively; otherwise returns Qt::CaseInsensitive.
3544
3545 \sa patternSyntax(), pattern(), isMinimal()
3546*/
3547Qt::CaseSensitivity QRegExp::caseSensitivity() const
3548{
3549 return priv->engineKey.cs;
3550}
3551
3552/*!
3553 Sets case sensitive matching to \a cs.
3554
3555 If \a cs is Qt::CaseSensitive, \bold{\\.txt$} matches
3556 \c{readme.txt} but not \c{README.TXT}.
3557
3558 \sa setPatternSyntax(), setPattern(), setMinimal()
3559*/
3560void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)
3561{
3562 if ((bool)cs != (bool)priv->engineKey.cs) {
3563 invalidateEngine(priv);
3564 priv->engineKey.cs = cs;
3565 }
3566}
3567
3568/*!
3569 Returns the syntax used by the regular expression. The default is
3570 QRegExp::RegExp.
3571
3572 \sa pattern(), caseSensitivity()
3573*/
3574QRegExp::PatternSyntax QRegExp::patternSyntax() const
3575{
3576 return priv->engineKey.patternSyntax;
3577}
3578
3579/*!
3580 Sets the syntax mode for the regular expression. The default is
3581 QRegExp::RegExp.
3582
3583 Setting \a syntax to QRegExp::Wildcard enables simple shell-like
3584 \l{wildcard matching}. For example, \bold{r*.txt} matches the
3585 string \c{readme.txt} in wildcard mode, but does not match
3586 \c{readme}.
3587
3588 Setting \a syntax to QRegExp::FixedString means that the pattern
3589 is interpreted as a plain string. Special characters (e.g.,
3590 backslash) don't need to be escaped then.
3591
3592 \sa setPattern(), setCaseSensitivity(), escape()
3593*/
3594void QRegExp::setPatternSyntax(PatternSyntax syntax)
3595{
3596 if (syntax != priv->engineKey.patternSyntax) {
3597 invalidateEngine(priv);
3598 priv->engineKey.patternSyntax = syntax;
3599 }
3600}
3601
3602/*!
3603 Returns true if minimal (non-greedy) matching is enabled;
3604 otherwise returns false.
3605
3606 \sa caseSensitivity(), setMinimal()
3607*/
3608bool QRegExp::isMinimal() const
3609{
3610 return priv->minimal;
3611}
3612
3613/*!
3614 Enables or disables minimal matching. If \a minimal is false,
3615 matching is greedy (maximal) which is the default.
3616
3617 For example, suppose we have the input string "We must be
3618 <b>bold</b>, very <b>bold</b>!" and the pattern
3619 \bold{<b>.*</b>}. With the default greedy (maximal) matching,
3620 the match is "We must be \underline{<b>bold</b>, very
3621 <b>bold</b>}!". But with minimal (non-greedy) matching, the
3622 first match is: "We must be \underline{<b>bold</b>}, very
3623 <b>bold</b>!" and the second match is "We must be <b>bold</b>,
3624 very \underline{<b>bold</b>}!". In practice we might use the pattern
3625 \bold{<b>[^<]*\</b>} instead, although this will still fail for
3626 nested tags.
3627
3628 \sa setCaseSensitivity()
3629*/
3630void QRegExp::setMinimal(bool minimal)
3631{
3632 priv->minimal = minimal;
3633}
3634
3635// ### Qt 5: make non-const
3636/*!
3637 Returns true if \a str is matched exactly by this regular
3638 expression; otherwise returns false. You can determine how much of
3639 the string was matched by calling matchedLength().
3640
3641 For a given regexp string R, exactMatch("R") is the equivalent of
3642 indexIn("^R$") since exactMatch() effectively encloses the regexp
3643 in the start of string and end of string anchors, except that it
3644 sets matchedLength() differently.
3645
3646 For example, if the regular expression is \bold{blue}, then
3647 exactMatch() returns true only for input \c blue. For inputs \c
3648 bluebell, \c blutak and \c lightblue, exactMatch() returns false
3649 and matchedLength() will return 4, 3 and 0 respectively.
3650
3651 Although const, this function sets matchedLength(),
3652 capturedTexts(), and pos().
3653
3654 \sa indexIn(), lastIndexIn()
3655*/
3656bool QRegExp::exactMatch(const QString &str) const
3657{
3658 prepareEngineForMatch(priv, str);
3659 priv->matchState.match(str.unicode(), str.length(), 0, priv->minimal, true, 0);
3660 if (priv->matchState.captured[1] == str.length()) {
3661 return true;
3662 } else {
3663 priv->matchState.captured[0] = 0;
3664 priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen;
3665 return false;
3666 }
3667}
3668
3669// ### Qt 5: make non-const
3670/*!
3671 Attempts to find a match in \a str from position \a offset (0 by
3672 default). If \a offset is -1, the search starts at the last
3673 character; if -2, at the next to last character; etc.
3674
3675 Returns the position of the first match, or -1 if there was no
3676 match.
3677
3678 The \a caretMode parameter can be used to instruct whether \bold{^}
3679 should match at index 0 or at \a offset.
3680
3681 You might prefer to use QString::indexOf(), QString::contains(),
3682 or even QStringList::filter(). To replace matches use
3683 QString::replace().
3684
3685 Example:
3686 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 13
3687
3688 Although const, this function sets matchedLength(),
3689 capturedTexts() and pos().
3690
    If the QRegExp is a wildcard expression (see setPatternSyntax())
    and you want to test a string against the whole wildcard expression,
    use exactMatch() instead of this function.
3694
3695 \sa lastIndexIn(), exactMatch()
3696*/
3697
3698int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const
3699{
3700 prepareEngineForMatch(priv, str);
3701 if (offset < 0)
3702 offset += str.length();
3703 priv->matchState.match(str.unicode(), str.length(), offset,
3704 priv->minimal, false, caretIndex(offset, caretMode));
3705 return priv->matchState.captured[0];
3706}
3707
3708// ### Qt 5: make non-const
3709/*!
3710 Attempts to find a match backwards in \a str from position \a
3711 offset. If \a offset is -1 (the default), the search starts at the
3712 last character; if -2, at the next to last character; etc.
3713
3714 Returns the position of the first match, or -1 if there was no
3715 match.
3716
3717 The \a caretMode parameter can be used to instruct whether \bold{^}
3718 should match at index 0 or at \a offset.
3719
3720 Although const, this function sets matchedLength(),
3721 capturedTexts() and pos().
3722
3723 \warning Searching backwards is much slower than searching
3724 forwards.
3725
3726 \sa indexIn(), exactMatch()
3727*/
3728
3729int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const
3730{
3731 prepareEngineForMatch(priv, str);
3732 if (offset < 0)
3733 offset += str.length();
3734 if (offset < 0 || offset > str.length()) {
3735 memset(priv->matchState.captured, -1, priv->matchState.capturedSize*sizeof(int));
3736 return -1;
3737 }
3738
3739 while (offset >= 0) {
3740 priv->matchState.match(str.unicode(), str.length(), offset,
3741 priv->minimal, true, caretIndex(offset, caretMode));
3742 if (priv->matchState.captured[0] == offset)
3743 return offset;
3744 --offset;
3745 }
3746 return -1;
3747}
3748
3749/*!
3750 Returns the length of the last matched string, or -1 if there was
3751 no match.
3752
3753 \sa exactMatch(), indexIn(), lastIndexIn()
3754*/
3755int QRegExp::matchedLength() const
3756{
3757 return priv->matchState.captured[1];
3758}
3759
3760#ifndef QT_NO_REGEXP_CAPTURE
3761/*!
3762 Returns the number of captures contained in the regular expression.
3763 */
3764int QRegExp::numCaptures() const
3765{
3766 prepareEngine(priv);
3767 return priv->eng->numCaptures();
3768}
3769
3770/*!
3771 Returns a list of the captured text strings.
3772
3773 The first string in the list is the entire matched string. Each
3774 subsequent list element contains a string that matched a
3775 (capturing) subexpression of the regexp.
3776
3777 For example:
3778 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 14
3779
3780 The above example also captures elements that may be present but
3781 which we have no interest in. This problem can be solved by using
3782 non-capturing parentheses:
3783
3784 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 15
3785
3786 Note that if you want to iterate over the list, you should iterate
3787 over a copy, e.g.
3788 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 16
3789
3790 Some regexps can match an indeterminate number of times. For
3791 example if the input string is "Offsets: 12 14 99 231 7" and the
3792 regexp, \c{rx}, is \bold{(\\d+)+}, we would hope to get a list of
3793 all the numbers matched. However, after calling
3794 \c{rx.indexIn(str)}, capturedTexts() will return the list ("12",
3795 "12"), i.e. the entire match was "12" and the first subexpression
3796 matched was "12". The correct approach is to use cap() in a
3797 \l{QRegExp#cap_in_a_loop}{loop}.
3798
3799 The order of elements in the string list is as follows. The first
3800 element is the entire matching string. Each subsequent element
3801 corresponds to the next capturing open left parentheses. Thus
3802 capturedTexts()[1] is the text of the first capturing parentheses,
3803 capturedTexts()[2] is the text of the second and so on
3804 (corresponding to $1, $2, etc., in some other regexp languages).
3805
3806 \sa cap(), pos()
3807*/
3808QStringList QRegExp::capturedTexts() const
3809{
3810 if (priv->capturedCache.isEmpty()) {
3811 prepareEngine(priv);
3812 const int *captured = priv->matchState.captured;
3813 int n = priv->matchState.capturedSize;
3814
3815 for (int i = 0; i < n; i += 2) {
3816 QString m;
3817 if (captured[i + 1] == 0)
3818 m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty
3819 else if (captured[i] >= 0)
3820 m = priv->t.mid(captured[i], captured[i + 1]);
3821 priv->capturedCache.append(m);
3822 }
3823 priv->t.clear();
3824 }
3825 return priv->capturedCache;
3826}
3827
3828/*!
3829 \internal
3830*/
3831QStringList QRegExp::capturedTexts()
3832{
3833 return const_cast<const QRegExp *>(this)->capturedTexts();
3834}
3835
3836/*!
3837 Returns the text captured by the \a nth subexpression. The entire
3838 match has index 0 and the parenthesized subexpressions have
3839 indexes starting from 1 (excluding non-capturing parentheses).
3840
3841 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 17
3842
3843 The order of elements matched by cap() is as follows. The first
3844 element, cap(0), is the entire matching string. Each subsequent
3845 element corresponds to the next capturing open left parentheses.
3846 Thus cap(1) is the text of the first capturing parentheses, cap(2)
3847 is the text of the second, and so on.
3848
3849 \sa capturedTexts(), pos()
3850*/
3851QString QRegExp::cap(int nth) const
3852{
3853 return capturedTexts().value(nth);
3854}
3855
3856/*!
3857 \internal
3858*/
3859QString QRegExp::cap(int nth)
3860{
3861 return const_cast<const QRegExp *>(this)->cap(nth);
3862}
3863
3864/*!
3865 Returns the position of the \a nth captured text in the searched
3866 string. If \a nth is 0 (the default), pos() returns the position
3867 of the whole match.
3868
3869 Example:
3870 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 18
3871
3872 For zero-length matches, pos() always returns -1. (For example, if
3873 cap(4) would return an empty string, pos(4) returns -1.) This is
3874 a feature of the implementation.
3875
3876 \sa cap(), capturedTexts()
3877*/
3878int QRegExp::pos(int nth) const
3879{
3880 if (nth < 0 || nth >= priv->matchState.capturedSize / 2)
3881 return -1;
3882 else
3883 return priv->matchState.captured[2 * nth];
3884}
3885
3886/*!
3887 \internal
3888*/
3889int QRegExp::pos(int nth)
3890{
3891 return const_cast<const QRegExp *>(this)->pos(nth);
3892}
3893
3894/*!
    Returns a text string that explains why a regexp pattern is
    invalid, if that is the case; otherwise returns "no error occurred".
3897
3898 \sa isValid()
3899*/
3900QString QRegExp::errorString() const
3901{
3902 if (isValid()) {
3903 return QString::fromLatin1(RXERR_OK);
3904 } else {
3905 return priv->eng->errorString();
3906 }
3907}
3908
3909/*!
3910 \internal
3911*/
3912QString QRegExp::errorString()
3913{
3914 return const_cast<const QRegExp *>(this)->errorString();
3915}
3916#endif
3917
3918/*!
3919 Returns the string \a str with every regexp special character
    escaped with a backslash. The special characters are $, (, ), *, +,
    ., ?, [, \\, ], ^, {, | and }.
3922
3923 Example:
3924
3925 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 19
3926
3927 This function is useful to construct regexp patterns dynamically:
3928
3929 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 20
3930
3931 \sa setPatternSyntax()
3932*/
3933QString QRegExp::escape(const QString &str)
3934{
3935 QString quoted;
3936 const int count = str.count();
3937 quoted.reserve(count * 2);
3938 const QLatin1Char backslash('\\');
3939 for (int i = 0; i < count; i++) {
3940 switch (str.at(i).toLatin1()) {
3941 case '$':
3942 case '(':
3943 case ')':
3944 case '*':
3945 case '+':
3946 case '.':
3947 case '?':
3948 case '[':
3949 case '\\':
3950 case ']':
3951 case '^':
3952 case '{':
3953 case '|':
3954 case '}':
3955 quoted.append(backslash);
3956 }
3957 quoted.append(str.at(i));
3958 }
3959 return quoted;
3960}
3961
3962/*!
3963 \fn bool QRegExp::caseSensitive() const
3964
3965 Use \l caseSensitivity() instead.
3966*/
3967
3968/*!
3969 \fn void QRegExp::setCaseSensitive(bool sensitive)
3970
3971 Use \l setCaseSensitivity() instead.
3972*/
3973
3974/*!
3975 \fn bool QRegExp::wildcard() const
3976
3977 Use \l patternSyntax() instead.
3978
3979 \oldcode
3980 bool wc = rx.wildcard();
3981 \newcode
3982 bool wc = (rx.patternSyntax() == QRegExp::Wildcard);
3983 \endcode
3984*/
3985
3986/*!
3987 \fn void QRegExp::setWildcard(bool wildcard)
3988
3989 Use \l setPatternSyntax() instead.
3990
3991 \oldcode
3992 rx.setWildcard(wc);
3993 \newcode
3994 rx.setPatternSyntax(wc ? QRegExp::Wildcard : QRegExp::RegExp);
3995 \endcode
3996*/
3997
3998/*!
3999 \fn bool QRegExp::minimal() const
4000
4001 Use \l isMinimal() instead.
4002*/
4003
4004/*!
4005 \fn int QRegExp::search(const QString &str, int from = 0,
4006 CaretMode caretMode = CaretAtZero) const
4007
4008 Use \l indexIn() instead.
4009*/
4010
4011/*!
4012 \fn int QRegExp::searchRev(const QString &str, int from = -1, \
4013 CaretMode caretMode = CaretAtZero) const
4014
4015 Use \l lastIndexIn() instead.
4016*/
4017
4018/*!
4019 \fn QRegExp::QRegExp(const QString &pattern, bool cs, bool wildcard = false)
4020
4021 Use another constructor instead.
4022
4023 \oldcode
4024 QRegExp rx("*.txt", false, true);
4025 \newcode
4026 QRegExp rx("*.txt", Qt::CaseInsensitive, QRegExp::Wildcard);
4027 \endcode
4028*/
4029
4030#ifndef QT_NO_DATASTREAM
4031/*!
4032 \relates QRegExp
4033
4034 Writes the regular expression \a regExp to stream \a out.
4035
4036 \sa {Format of the QDataStream Operators}
4037*/
4038QDataStream &operator<<(QDataStream &out, const QRegExp &regExp)
4039{
4040 return out << regExp.pattern() << (quint8)regExp.caseSensitivity()
4041 << (quint8)regExp.patternSyntax()
4042 << (quint8)!!regExp.isMinimal();
4043}
4044
4045/*!
4046 \relates QRegExp
4047
4048 Reads a regular expression from stream \a in into \a regExp.
4049
4050 \sa {Format of the QDataStream Operators}
4051*/
4052QDataStream &operator>>(QDataStream &in, QRegExp &regExp)
4053{
4054 QString pattern;
4055 quint8 cs;
4056 quint8 patternSyntax;
4057 quint8 isMinimal;
4058
4059 in >> pattern >> cs >> patternSyntax >> isMinimal;
4060
4061 QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),
4062 QRegExp::PatternSyntax(patternSyntax));
4063
4064 newRegExp.setMinimal(isMinimal);
4065 regExp = newRegExp;
4066 return in;
4067}
4068#endif
4069
4070QT_END_NAMESPACE
Note: See TracBrowser for help on using the repository browser.