1 | /****************************************************************************
|
---|
2 | **
|
---|
3 | ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
|
---|
4 | ** All rights reserved.
|
---|
5 | ** Contact: Nokia Corporation ([email protected])
|
---|
6 | **
|
---|
7 | ** This file is part of the QtCore module of the Qt Toolkit.
|
---|
8 | **
|
---|
9 | ** $QT_BEGIN_LICENSE:LGPL$
|
---|
10 | ** Commercial Usage
|
---|
11 | ** Licensees holding valid Qt Commercial licenses may use this file in
|
---|
12 | ** accordance with the Qt Commercial License Agreement provided with the
|
---|
13 | ** Software or, alternatively, in accordance with the terms contained in
|
---|
14 | ** a written agreement between you and Nokia.
|
---|
15 | **
|
---|
16 | ** GNU Lesser General Public License Usage
|
---|
17 | ** Alternatively, this file may be used under the terms of the GNU Lesser
|
---|
18 | ** General Public License version 2.1 as published by the Free Software
|
---|
19 | ** Foundation and appearing in the file LICENSE.LGPL included in the
|
---|
20 | ** packaging of this file. Please review the following information to
|
---|
21 | ** ensure the GNU Lesser General Public License version 2.1 requirements
|
---|
22 | ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
|
---|
23 | **
|
---|
24 | ** In addition, as a special exception, Nokia gives you certain additional
|
---|
25 | ** rights. These rights are described in the Nokia Qt LGPL Exception
|
---|
26 | ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
|
---|
27 | **
|
---|
28 | ** GNU General Public License Usage
|
---|
29 | ** Alternatively, this file may be used under the terms of the GNU
|
---|
30 | ** General Public License version 3.0 as published by the Free Software
|
---|
31 | ** Foundation and appearing in the file LICENSE.GPL included in the
|
---|
32 | ** packaging of this file. Please review the following information to
|
---|
33 | ** ensure the GNU General Public License version 3.0 requirements will be
|
---|
34 | ** met: http://www.gnu.org/copyleft/gpl.html.
|
---|
35 | **
|
---|
36 | ** If you have questions regarding the use of this file, please contact
|
---|
37 | ** Nokia at [email protected].
|
---|
38 | ** $QT_END_LICENSE$
|
---|
39 | **
|
---|
40 | ****************************************************************************/
|
---|
41 |
|
---|
42 | #include "qregexp.h"
|
---|
43 |
|
---|
44 | #include "qalgorithms.h"
|
---|
45 | #include "qbitarray.h"
|
---|
46 | #include "qcache.h"
|
---|
47 | #include "qdatastream.h"
|
---|
48 | #include "qlist.h"
|
---|
49 | #include "qmap.h"
|
---|
50 | #include "qmutex.h"
|
---|
51 | #include "qstring.h"
|
---|
52 | #include "qstringlist.h"
|
---|
53 | #include "qstringmatcher.h"
|
---|
54 | #include "qvector.h"
|
---|
55 | #include "private/qfunctions_p.h"
|
---|
56 |
|
---|
57 | #include <limits.h>
|
---|
58 |
|
---|
59 | QT_BEGIN_NAMESPACE
|
---|
60 |
|
---|
61 | int qFindString(const QChar *haystack, int haystackLen, int from,
|
---|
62 | const QChar *needle, int needleLen, Qt::CaseSensitivity cs);
|
---|
63 |
|
---|
64 | // Error strings for the regexp parser. Each message is wrapped in QT_TRANSLATE_NOOP with the "QRegExp" context; RXERR_OK is the "no error occurred" message.
|
---|
65 | #define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
|
---|
66 | #define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
|
---|
67 | #define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
|
---|
68 | #define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
|
---|
69 | #define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
|
---|
70 | #define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
|
---|
71 | #define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
|
---|
72 | #define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
|
---|
73 | #define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
|
---|
74 | #define RXERR_INTERVAL QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
|
---|
75 | #define RXERR_CATEGORY QT_TRANSLATE_NOOP("QRegExp", "invalid category")
|
---|
76 |
|
---|
77 | /*
|
---|
78 | WARNING! Be sure to read qregexp.tex before modifying this file.
|
---|
79 | */
|
---|
80 |
|
---|
81 | /*!
|
---|
82 | \class QRegExp
|
---|
83 | \reentrant
|
---|
84 | \brief The QRegExp class provides pattern matching using regular expressions.
|
---|
85 |
|
---|
86 | \ingroup tools
|
---|
87 | \ingroup shared
|
---|
88 |
|
---|
89 | \keyword regular expression
|
---|
90 |
|
---|
91 | A regular expression, or "regexp", is a pattern for matching
|
---|
92 | substrings in a text. This is useful in many contexts, e.g.,
|
---|
93 |
|
---|
94 | \table
|
---|
95 | \row \i Validation
|
---|
96 | \i A regexp can test whether a substring meets some criteria,
|
---|
97 | e.g. is an integer or contains no whitespace.
|
---|
98 | \row \i Searching
|
---|
99 | \i A regexp provides more powerful pattern matching than
|
---|
100 | simple substring matching, e.g., match one of the words
|
---|
101 | \e{mail}, \e{letter} or \e{correspondence}, but none of the
|
---|
102 | words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
|
---|
103 | \row \i Search and Replace
|
---|
104 | \i A regexp can replace all occurrences of a substring with a
|
---|
105 | different substring, e.g., replace all occurrences of \e{&}
|
---|
106 | with \e{\&amp;} except where the \e{&} is already followed by
|
---|
107 | an \e{amp;}.
|
---|
108 | \row \i String Splitting
|
---|
109 | \i A regexp can be used to identify where a string should be
|
---|
110 | split apart, e.g. splitting tab-delimited strings.
|
---|
111 | \endtable
|
---|
112 |
|
---|
113 | This document gives a brief introduction to regexps, followed by a
|
---|
114 | description of Qt's regexp language, some examples, and the function
|
---|
115 | documentation itself. QRegExp is modeled on Perl's regexp
|
---|
116 | language. It fully supports Unicode. QRegExp can also be used in a
|
---|
117 | simpler, \e{wildcard mode} that is similar to the functionality
|
---|
118 | found in command shells. The syntax rules used by QRegExp can be
|
---|
119 | changed with setPatternSyntax(). In particular, the pattern syntax
|
---|
120 | can be set to QRegExp::FixedString, which means the pattern to be
|
---|
121 | matched is interpreted as a plain string, i.e., special characters
|
---|
122 | (e.g., backslash) are not escaped.
|
---|
123 |
|
---|
124 | A good text on regexps is \e {Mastering Regular Expressions}
|
---|
125 | (Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4.
|
---|
126 |
|
---|
127 | \tableofcontents
|
---|
128 |
|
---|
129 | \section1 Introduction
|
---|
130 |
|
---|
131 | Regexps are built up from expressions, quantifiers, and
|
---|
132 | assertions. The simplest expression is a character, e.g. \bold{x}
|
---|
133 | or \bold{5}. An expression can also be a set of characters
|
---|
134 | enclosed in square brackets. \bold{[ABCD]} will match an \bold{A}
|
---|
135 | or a \bold{B} or a \bold{C} or a \bold{D}. We can write this same
|
---|
136 | expression as \bold{[A-D]}, and an expression to match any
|
---|
137 | capital letter in the English alphabet is written as
|
---|
138 | \bold{[A-Z]}.
|
---|
139 |
|
---|
140 | A quantifier specifies the number of occurrences of an expression
|
---|
141 | that must be matched. \bold{x{1,1}} means match one and only one
|
---|
142 | \bold{x}. \bold{x{1,5}} means match a sequence of \bold{x}
|
---|
143 | characters that contains at least one \bold{x} but no more than
|
---|
144 | five.
|
---|
145 |
|
---|
146 | Note that in general regexps cannot be used to check for balanced
|
---|
147 | brackets or tags. For example, a regexp can be written to match an
|
---|
148 | opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
|
---|
149 | are not nested, but if the \c{<b>} tags are nested, that same
|
---|
150 | regexp will match an opening \c{<b>} tag with the wrong closing
|
---|
151 | \c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the
|
---|
152 | first \c{<b>} would be matched with the first \c{</b>}, which is
|
---|
153 | not correct. However, it is possible to write a regexp that will
|
---|
154 | match nested brackets or tags correctly, but only if the number of
|
---|
155 | nesting levels is fixed and known. If the number of nesting levels
|
---|
156 | is not fixed and known, it is impossible to write a regexp that
|
---|
157 | will not fail.
|
---|
158 |
|
---|
159 | Suppose we want a regexp to match integers in the range 0 to 99.
|
---|
160 | At least one digit is required, so we start with the expression
|
---|
161 | \bold{[0-9]{1,1}}, which matches a single digit exactly once. This
|
---|
162 | regexp matches integers in the range 0 to 9. To match integers up
|
---|
163 | to 99, increase the maximum number of occurrences to 2, so the
|
---|
164 | regexp becomes \bold{[0-9]{1,2}}. This regexp satisfies the
|
---|
165 | original requirement to match integers from 0 to 99, but it will
|
---|
166 | also match integers that occur in the middle of strings. If we
|
---|
167 | want the matched integer to be the whole string, we must use the
|
---|
168 | anchor assertions, \bold{^} (caret) and \bold{$} (dollar). When
|
---|
169 | \bold{^} is the first character in a regexp, it means the regexp
|
---|
170 | must match from the beginning of the string. When \bold{$} is the
|
---|
171 | last character of the regexp, it means the regexp must match to
|
---|
172 | the end of the string. The regexp becomes \bold{^[0-9]{1,2}$}.
|
---|
173 | Note that assertions, e.g. \bold{^} and \bold{$}, do not match
|
---|
174 | characters but locations in the string.
|
---|
175 |
|
---|
176 | If you have seen regexps described elsewhere, they may have looked
|
---|
177 | different from the ones shown here. This is because some sets of
|
---|
178 | characters and some quantifiers are so common that they have been
|
---|
179 | given special symbols to represent them. \bold{[0-9]} can be
|
---|
180 | replaced with the symbol \bold{\\d}. The quantifier to match
|
---|
181 | exactly one occurrence, \bold{{1,1}}, can be replaced with the
|
---|
182 | expression itself, i.e. \bold{x{1,1}} is the same as \bold{x}. So
|
---|
183 | our 0 to 99 matcher could be written as \bold{^\\d{1,2}$}. It can
|
---|
184 | also be written \bold{^\\d\\d{0,1}$}, i.e. \e{From the start of
|
---|
185 | the string, match a digit, followed immediately by 0 or 1 digits}.
|
---|
186 | In practice, it would be written as \bold{^\\d\\d?$}. The \bold{?}
|
---|
187 | is shorthand for the quantifier \bold{{0,1}}, i.e. 0 or 1
|
---|
188 | occurrences. \bold{?} makes an expression optional. The regexp
|
---|
189 | \bold{^\\d\\d?$} means \e{From the beginning of the string, match
|
---|
190 | one digit, followed immediately by 0 or 1 more digit, followed
|
---|
191 | immediately by end of string}.
|
---|
192 |
|
---|
193 | To write a regexp that matches one of the words 'mail' \e or
|
---|
194 | 'letter' \e or 'correspondence' but does not match words that
|
---|
195 | contain these words, e.g., 'email', 'mailman', 'mailer', and
|
---|
196 | 'letterbox', start with a regexp that matches 'mail'. Expressed
|
---|
197 | fully, the regexp is \bold{m{1,1}a{1,1}i{1,1}l{1,1}}, but because
|
---|
198 | a character expression is automatically quantified by
|
---|
199 | \bold{{1,1}}, we can simplify the regexp to \bold{mail}, i.e., an
|
---|
200 | 'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now
|
---|
201 | we can use the vertical bar \bold{|}, which means \bold{or}, to
|
---|
202 | include the other two words, so our regexp for matching any of the
|
---|
203 | three words becomes \bold{mail|letter|correspondence}. Match
|
---|
204 | 'mail' \bold{or} 'letter' \bold{or} 'correspondence'. While this
|
---|
205 | regexp will match one of the three words we want to match, it will
|
---|
206 | also match words we don't want to match, e.g., 'email'. To
|
---|
207 | prevent the regexp from matching unwanted words, we must tell it
|
---|
208 | to begin and end the match at word boundaries. First we enclose
|
---|
209 | our regexp in parentheses, \bold{(mail|letter|correspondence)}.
|
---|
210 | Parentheses group expressions together, and they identify a part
|
---|
211 | of the regexp that we wish to \l{capturing text}{capture}.
|
---|
212 | Enclosing the expression in parentheses allows us to use it as a
|
---|
213 | component in more complex regexps. It also allows us to examine
|
---|
214 | which of the three words was actually matched. To force the match
|
---|
215 | to begin and end on word boundaries, we enclose the regexp in
|
---|
216 | \bold{\\b} \e{word boundary} assertions:
|
---|
217 | \bold{\\b(mail|letter|correspondence)\\b}. Now the regexp means:
|
---|
218 | \e{Match a word boundary, followed by the regexp in parentheses,
|
---|
219 | followed by a word boundary}. The \bold{\\b} assertion matches a
|
---|
220 | \e position in the regexp, not a \e character. A word boundary is
|
---|
221 | the position between a word character and a non-word character, e.g., a
|
---|
222 | space or newline, or the beginning or end of the string.
|
---|
223 |
|
---|
224 | If we want to replace ampersand characters with the HTML entity
|
---|
225 | \bold{\&amp;}, the regexp to match is simply \bold{\&}. But this
|
---|
226 | regexp will also match ampersands that have already been converted
|
---|
227 | to HTML entities. We want to replace only ampersands that are not
|
---|
228 | already followed by \bold{amp;}. For this, we need the negative
|
---|
229 | lookahead assertion, \bold{(?!}__\bold{)}. The regexp can then be
|
---|
230 | written as \bold{\&(?!amp;)}, i.e. \e{Match an ampersand that is}
|
---|
231 | \bold{not} \e{followed by} \bold{amp;}.
|
---|
232 |
|
---|
233 | If we want to count all the occurrences of 'Eric' and 'Eirik' in a
|
---|
234 | string, two valid solutions are \bold{\\b(Eric|Eirik)\\b} and
|
---|
235 | \bold{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is
|
---|
236 | required to avoid matching words that contain either name,
|
---|
237 | e.g. 'Ericsson'. Note that the second regexp matches more
|
---|
238 | spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.
|
---|
239 |
|
---|
240 | Some of the examples discussed above are implemented in the
|
---|
241 | \link #code-examples code examples \endlink section.
|
---|
242 |
|
---|
243 | \target characters-and-abbreviations-for-sets-of-characters
|
---|
244 | \section1 Characters and Abbreviations for Sets of Characters
|
---|
245 |
|
---|
246 | \table
|
---|
247 | \header \i Element \i Meaning
|
---|
248 | \row \i \bold{c}
|
---|
249 | \i A character represents itself unless it has a special
|
---|
250 | regexp meaning. e.g. \bold{c} matches the character \e c.
|
---|
251 | \row \i \bold{\\c}
|
---|
252 | \i A character that follows a backslash matches the character
|
---|
253 | itself, except as specified below. e.g., To match a literal
|
---|
254 | caret at the beginning of a string, write \bold{\\^}.
|
---|
255 | \row \i \bold{\\a}
|
---|
256 | \i Matches the ASCII bell (BEL, 0x07).
|
---|
257 | \row \i \bold{\\f}
|
---|
258 | \i Matches the ASCII form feed (FF, 0x0C).
|
---|
259 | \row \i \bold{\\n}
|
---|
260 | \i Matches the ASCII line feed (LF, 0x0A, Unix newline).
|
---|
261 | \row \i \bold{\\r}
|
---|
262 | \i Matches the ASCII carriage return (CR, 0x0D).
|
---|
263 | \row \i \bold{\\t}
|
---|
264 | \i Matches the ASCII horizontal tab (HT, 0x09).
|
---|
265 | \row \i \bold{\\v}
|
---|
266 | \i Matches the ASCII vertical tab (VT, 0x0B).
|
---|
267 | \row \i \bold{\\x\e{hhhh}}
|
---|
268 | \i Matches the Unicode character corresponding to the
|
---|
269 | hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).
|
---|
270 | \row \i \bold{\\0\e{ooo}} (i.e., \\zero \e{ooo})
|
---|
271 | \i Matches the ASCII/Latin1 character for the octal number
|
---|
272 | \e{ooo} (between 0 and 0377).
|
---|
273 | \row \i \bold{. (dot)}
|
---|
274 | \i Matches any character (including newline).
|
---|
275 | \row \i \bold{\\d}
|
---|
276 | \i Matches a digit (QChar::isDigit()).
|
---|
277 | \row \i \bold{\\D}
|
---|
278 | \i Matches a non-digit.
|
---|
279 | \row \i \bold{\\s}
|
---|
280 | \i Matches a whitespace character (QChar::isSpace()).
|
---|
281 | \row \i \bold{\\S}
|
---|
282 | \i Matches a non-whitespace character.
|
---|
283 | \row \i \bold{\\w}
|
---|
284 | \i Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').
|
---|
285 | \row \i \bold{\\W}
|
---|
286 | \i Matches a non-word character.
|
---|
287 | \row \i \bold{\\\e{n}}
|
---|
288 | \i The \e{n}-th \l backreference, e.g. \\1, \\2, etc.
|
---|
289 | \endtable
|
---|
290 |
|
---|
291 | \bold{Note:} The C++ compiler transforms backslashes in strings.
|
---|
292 | To include a \bold{\\} in a regexp, enter it twice, i.e. \c{\\}.
|
---|
293 | To match the backslash character itself, enter it four times, i.e.
|
---|
294 | \c{\\\\}.
|
---|
295 |
|
---|
296 | \target sets-of-characters
|
---|
297 | \section1 Sets of Characters
|
---|
298 |
|
---|
299 | Square brackets mean match any character contained in the square
|
---|
300 | brackets. The character set abbreviations described above can
|
---|
301 | appear in a character set in square brackets. Except for the
|
---|
302 | character set abbreviations and the following two exceptions,
|
---|
303 | characters do not have special meanings in square brackets.
|
---|
304 |
|
---|
305 | \table
|
---|
306 | \row \i \bold{^}
|
---|
307 |
|
---|
308 | \i The caret negates the character set if it occurs as the
|
---|
309 | first character (i.e. immediately after the opening square
|
---|
310 | bracket). \bold{[abc]} matches 'a' or 'b' or 'c', but
|
---|
311 | \bold{[^abc]} matches anything \e but 'a' or 'b' or 'c'.
|
---|
312 |
|
---|
313 | \row \i \bold{-}
|
---|
314 |
|
---|
315 | \i The dash indicates a range of characters. \bold{[W-Z]}
|
---|
316 | matches 'W' or 'X' or 'Y' or 'Z'.
|
---|
317 |
|
---|
318 | \endtable
|
---|
319 |
|
---|
320 | Using the predefined character set abbreviations is more portable
|
---|
321 | than using character ranges across platforms and languages. For
|
---|
322 | example, \bold{[0-9]} matches a digit in Western alphabets but
|
---|
323 | \bold{\\d} matches a digit in \e any alphabet.
|
---|
324 |
|
---|
325 | Note: In other regexp documentation, sets of characters are often
|
---|
326 | called "character classes".
|
---|
327 |
|
---|
328 | \target quantifiers
|
---|
329 | \section1 Quantifiers
|
---|
330 |
|
---|
331 | By default, an expression is automatically quantified by
|
---|
332 | \bold{{1,1}}, i.e. it should occur exactly once. In the following
|
---|
333 | list, \bold{\e {E}} stands for expression. An expression is a
|
---|
334 | character, or an abbreviation for a set of characters, or a set of
|
---|
335 | characters in square brackets, or an expression in parentheses.
|
---|
336 |
|
---|
337 | \table
|
---|
338 | \row \i \bold{\e {E}?}
|
---|
339 |
|
---|
340 | \i Matches zero or one occurrences of \e E. This quantifier
|
---|
341 | means \e{The previous expression is optional}, because it
|
---|
342 | will match whether or not the expression is found. \bold{\e
|
---|
343 | {E}?} is the same as \bold{\e {E}{0,1}}. e.g., \bold{dents?}
|
---|
344 | matches 'dent' or 'dents'.
|
---|
345 |
|
---|
346 | \row \i \bold{\e {E}+}
|
---|
347 |
|
---|
348 | \i Matches one or more occurrences of \e E. \bold{\e {E}+} is
|
---|
349 | the same as \bold{\e {E}{1,}}. e.g., \bold{0+} matches '0',
|
---|
350 | '00', '000', etc.
|
---|
351 |
|
---|
352 | \row \i \bold{\e {E}*}
|
---|
353 |
|
---|
354 | \i Matches zero or more occurrences of \e E. It is the same
|
---|
355 | as \bold{\e {E}{0,}}. The \bold{*} quantifier is often used
|
---|
356 | in error where \bold{+} should be used. For example, if
|
---|
357 | \bold{\\s*$} is used in an expression to match strings that
|
---|
358 | end in whitespace, it will match every string because
|
---|
359 | \bold{\\s*$} means \e{Match zero or more whitespaces followed
|
---|
360 | by end of string}. The correct regexp to match strings that
|
---|
361 | have at least one trailing whitespace character is
|
---|
362 | \bold{\\s+$}.
|
---|
363 |
|
---|
364 | \row \i \bold{\e {E}{n}}
|
---|
365 |
|
---|
366 | \i Matches exactly \e n occurrences of \e E. \bold{\e {E}{n}}
|
---|
367 | is the same as repeating \e E \e n times. For example,
|
---|
368 | \bold{x{5}} is the same as \bold{xxxxx}. It is also the same
|
---|
369 | as \bold{\e {E}{n,n}}, e.g. \bold{x{5,5}}.
|
---|
370 |
|
---|
371 | \row \i \bold{\e {E}{n,}}
|
---|
372 | \i Matches at least \e n occurrences of \e E.
|
---|
373 |
|
---|
374 | \row \i \bold{\e {E}{,m}}
|
---|
375 | \i Matches at most \e m occurrences of \e E. \bold{\e {E}{,m}}
|
---|
376 | is the same as \bold{\e {E}{0,m}}.
|
---|
377 |
|
---|
378 | \row \i \bold{\e {E}{n,m}}
|
---|
379 | \i Matches at least \e n and at most \e m occurrences of \e E.
|
---|
380 | \endtable
|
---|
381 |
|
---|
382 | To apply a quantifier to more than just the preceding character,
|
---|
383 | use parentheses to group characters together in an expression. For
|
---|
384 | example, \bold{tag+} matches a 't' followed by an 'a' followed by
|
---|
385 | at least one 'g', whereas \bold{(tag)+} matches at least one
|
---|
386 | occurrence of 'tag'.
|
---|
387 |
|
---|
388 | Note: Quantifiers are normally "greedy". They always match as much
|
---|
389 | text as they can. For example, \bold{0+} matches the first zero it
|
---|
390 | finds and all the consecutive zeros after the first zero. Applied
|
---|
391 | to '20005', it matches '2\underline{000}5'. Quantifiers can be made
|
---|
392 | non-greedy, see setMinimal().
|
---|
393 |
|
---|
394 | \target capturing parentheses
|
---|
395 | \target backreferences
|
---|
396 | \section1 Capturing Text
|
---|
397 |
|
---|
398 | Parentheses allow us to group elements together so that we can
|
---|
399 | quantify and capture them. For example if we have the expression
|
---|
400 | \bold{mail|letter|correspondence} that matches a string we know
|
---|
401 | that \e one of the words matched but not which one. Using
|
---|
402 | parentheses allows us to "capture" whatever is matched within
|
---|
403 | their bounds, so if we used \bold{(mail|letter|correspondence)}
|
---|
404 | and matched this regexp against the string "I sent you some email"
|
---|
405 | we can use the cap() or capturedTexts() functions to extract the
|
---|
406 | matched characters, in this case 'mail'.
|
---|
407 |
|
---|
408 | We can use captured text within the regexp itself. To refer to the
|
---|
409 | captured text we use \e backreferences which are indexed from 1,
|
---|
410 | the same as for cap(). For example we could search for duplicate
|
---|
411 | words in a string using \bold{\\b(\\w+)\\W+\\1\\b} which means match a
|
---|
412 | word boundary followed by one or more word characters followed by
|
---|
413 | one or more non-word characters followed by the same text as the
|
---|
414 | first parenthesized expression followed by a word boundary.
|
---|
415 |
|
---|
416 | If we want to use parentheses purely for grouping and not for
|
---|
417 | capturing we can use the non-capturing syntax, e.g.
|
---|
418 | \bold{(?:green|blue)}. Non-capturing parentheses begin with '(?:' and
|
---|
419 | end with ')'. In this example we match either 'green' or 'blue' but we
|
---|
420 | do not capture the match so we only know whether or not we matched
|
---|
421 | but not which color we actually found. Using non-capturing
|
---|
422 | parentheses is more efficient than using capturing parentheses
|
---|
423 | since the regexp engine has to do less book-keeping.
|
---|
424 |
|
---|
425 | Both capturing and non-capturing parentheses may be nested.
|
---|
426 |
|
---|
427 | \target greedy quantifiers
|
---|
428 |
|
---|
429 | For historical reasons, quantifiers (e.g. \bold{*}) that apply to
|
---|
430 | capturing parentheses are more "greedy" than other quantifiers.
|
---|
431 | For example, \bold{a*(a)*} will match "aaa" with cap(1) == "aaa".
|
---|
432 | This behavior is different from what other regexp engines do
|
---|
433 | (notably, Perl). To obtain a more intuitive capturing behavior,
|
---|
434 | specify QRegExp::RegExp2 to the QRegExp constructor or call
|
---|
435 | setPatternSyntax(QRegExp::RegExp2).
|
---|
436 |
|
---|
437 | \target cap_in_a_loop
|
---|
438 |
|
---|
439 | When the number of matches cannot be determined in advance, a
|
---|
440 | common idiom is to use cap() in a loop. For example:
|
---|
441 |
|
---|
442 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 0
|
---|
443 |
|
---|
444 | \target assertions
|
---|
445 | \section1 Assertions
|
---|
446 |
|
---|
447 | Assertions make some statement about the text at the point where
|
---|
448 | they occur in the regexp but they do not match any characters. In
|
---|
449 | the following list \bold{\e {E}} stands for any expression.
|
---|
450 |
|
---|
451 | \table
|
---|
452 | \row \i \bold{^}
|
---|
453 | \i The caret signifies the beginning of the string. If you
|
---|
454 | wish to match a literal \c{^} you must escape it by
|
---|
455 | writing \c{\\^}. For example, \bold{^#include} will only
|
---|
456 | match strings which \e begin with the characters '#include'.
|
---|
457 | (When the caret is the first character of a character set it
|
---|
458 | has a special meaning, see \link #sets-of-characters Sets of
|
---|
459 | Characters \endlink.)
|
---|
460 |
|
---|
461 | \row \i \bold{$}
|
---|
462 | \i The dollar signifies the end of the string. For example
|
---|
463 | \bold{\\d\\s*$} will match strings which end with a digit
|
---|
464 | optionally followed by whitespace. If you wish to match a
|
---|
465 | literal \c{$} you must escape it by writing
|
---|
466 | \c{\\$}.
|
---|
467 |
|
---|
468 | \row \i \bold{\\b}
|
---|
469 | \i A word boundary. For example the regexp
|
---|
470 | \bold{\\bOK\\b} means match immediately after a word
|
---|
471 | boundary (e.g. start of string or whitespace) the letter 'O'
|
---|
472 | then the letter 'K' immediately before another word boundary
|
---|
473 | (e.g. end of string or whitespace). But note that the
|
---|
474 | assertion does not actually match any whitespace so if we
|
---|
475 | write \bold{(\\bOK\\b)} and we have a match it will only
|
---|
476 | contain 'OK' even if the string is "It's \underline{OK} now".
|
---|
477 |
|
---|
478 | \row \i \bold{\\B}
|
---|
479 | \i A non-word boundary. This assertion is true wherever
|
---|
480 | \bold{\\b} is false. For example if we searched for
|
---|
481 | \bold{\\Bon\\B} in "Left on" the match would fail (space
|
---|
482 | and end of string aren't non-word boundaries), but it would
|
---|
483 | match in "t\underline{on}ne".
|
---|
484 |
|
---|
485 | \row \i \bold{(?=\e E)}
|
---|
486 | \i Positive lookahead. This assertion is true if the
|
---|
487 | expression matches at this point in the regexp. For example,
|
---|
488 | \bold{const(?=\\s+char)} matches 'const' whenever it is
|
---|
489 | followed by 'char', as in 'static \underline{const} char *'.
|
---|
490 | (Compare with \bold{const\\s+char}, which matches 'static
|
---|
491 | \underline{const char} *'.)
|
---|
492 |
|
---|
493 | \row \i \bold{(?!\e E)}
|
---|
494 | \i Negative lookahead. This assertion is true if the
|
---|
495 | expression does not match at this point in the regexp. For
|
---|
496 | example, \bold{const(?!\\s+char)} matches 'const' \e except
|
---|
497 | when it is followed by 'char'.
|
---|
498 | \endtable
|
---|
499 |
|
---|
500 | \keyword QRegExp wildcard matching
|
---|
501 | \section1 Wildcard Matching
|
---|
502 |
|
---|
503 | Most command shells such as \e bash or \e cmd.exe support "file
|
---|
504 | globbing", the ability to identify a group of files by using
|
---|
505 | wildcards. The setPatternSyntax() function is used to switch
|
---|
506 | between regexp and wildcard mode. Wildcard matching is much
|
---|
507 | simpler than full regexps and has only four features:
|
---|
508 |
|
---|
509 | \table
|
---|
510 | \row \i \bold{c}
|
---|
511 | \i Any character represents itself apart from those mentioned
|
---|
512 | below. Thus \bold{c} matches the character \e c.
|
---|
513 | \row \i \bold{?}
|
---|
514 | \i Matches any single character. It is the same as
|
---|
515 | \bold{.} in full regexps.
|
---|
516 | \row \i \bold{*}
|
---|
517 | \i Matches zero or more of any characters. It is the
|
---|
518 | same as \bold{.*} in full regexps.
|
---|
519 | \row \i \bold{[...]}
|
---|
520 | \i Sets of characters can be represented in square brackets,
|
---|
521 | similar to full regexps. Within the character class, like
|
---|
522 | outside, backslash has no special meaning.
|
---|
523 | \endtable
|
---|
524 |
|
---|
525 | In the mode Wildcard, the wildcard characters cannot be
|
---|
526 | escaped. In the mode WildcardUnix, the character '\' escapes the
|
---|
527 | wildcard.
|
---|
528 |
|
---|
529 | For example if we are in wildcard mode and have strings which
|
---|
530 | contain filenames we could identify HTML files with \bold{*.html}.
|
---|
531 | This will match zero or more characters followed by a dot followed
|
---|
532 | by 'h', 't', 'm' and 'l'.
|
---|
533 |
|
---|
534 | To test a string against a wildcard expression, use exactMatch().
|
---|
535 | For example:
|
---|
536 |
|
---|
537 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 1
|
---|
538 |
|
---|
539 | \target perl-users
|
---|
540 | \section1 Notes for Perl Users
|
---|
541 |
|
---|
542 | Most of the character class abbreviations supported by Perl are
|
---|
543 | supported by QRegExp, see \link
|
---|
544 | #characters-and-abbreviations-for-sets-of-characters characters
|
---|
545 | and abbreviations for sets of characters \endlink.
|
---|
546 |
|
---|
547 | In QRegExp, apart from within character classes, \c{^} always
|
---|
548 | signifies the start of the string, so carets must always be
|
---|
549 | escaped unless used for that purpose. In Perl the meaning of caret
|
---|
550 | varies automagically depending on where it occurs so escaping it
|
---|
551 | is rarely necessary. The same applies to \c{$} which in
|
---|
552 | QRegExp always signifies the end of the string.
|
---|
553 |
|
---|
554 | QRegExp's quantifiers are the same as Perl's greedy quantifiers
|
---|
555 | (but see the \l{greedy quantifiers}{note above}). Non-greedy
|
---|
556 | matching cannot be applied to individual quantifiers, but can be
|
---|
557 | applied to all the quantifiers in the pattern. For example, to
|
---|
558 | match the Perl regexp \bold{ro+?m} requires:
|
---|
559 |
|
---|
560 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 2
|
---|
561 |
|
---|
562 | The equivalent of Perl's \c{/i} option is
|
---|
563 | setCaseSensitivity(Qt::CaseInsensitive).
|
---|
564 |
|
---|
565 | Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.
|
---|
566 |
|
---|
567 | In QRegExp \bold{.} matches any character, therefore all QRegExp
|
---|
568 | regexps have the equivalent of Perl's \c{/s} option. QRegExp
|
---|
569 | does not have an equivalent to Perl's \c{/m} option, but this
|
---|
570 | can be emulated in various ways for example by splitting the input
|
---|
571 | into lines or by looping with a regexp that searches for newlines.
|
---|
572 |
|
---|
573 | Because QRegExp is string oriented, there are no \\A, \\Z, or \\z
|
---|
574 | assertions. The \\G assertion is not supported but can be emulated
|
---|
575 | in a loop.
|
---|
576 |
|
---|
577 | Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
|
---|
578 | equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
|
---|
579 | ... correspond to cap(1) or capturedTexts()[1], cap(2) or
|
---|
580 | capturedTexts()[2], etc.
|
---|
581 |
|
---|
582 | To substitute a pattern use QString::replace().
|
---|
583 |
|
---|
584 | Perl's extended \c{/x} syntax is not supported, nor are
|
---|
585 | directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
|
---|
586 | the other hand, C++'s rules for literal strings can be used to
|
---|
587 | achieve the same:
|
---|
588 |
|
---|
589 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 3
|
---|
590 |
|
---|
591 | Both zero-width positive and zero-width negative lookahead
|
---|
592 | assertions (?=pattern) and (?!pattern) are supported with the same
|
---|
593 | syntax as Perl. Perl's lookbehind assertions, "independent"
|
---|
594 | subexpressions and conditional expressions are not supported.
|
---|
595 |
|
---|
596 | Non-capturing parentheses are also supported, with the same
|
---|
597 | (?:pattern) syntax.
|
---|
598 |
|
---|
599 | See QString::split() and QStringList::join() for equivalents
|
---|
600 | to Perl's split and join functions.
|
---|
601 |
|
---|
602 | Note: because C++ transforms \\'s they must be written \e twice in
|
---|
603 | code, e.g. \bold{\\b} must be written \bold{\\\\b}.
|
---|
604 |
|
---|
605 | \target code-examples
|
---|
606 | \section1 Code Examples
|
---|
607 |
|
---|
608 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 4
|
---|
609 |
|
---|
610 | The third string matches '\underline{6}'. This is a simple validation
|
---|
611 | regexp for integers in the range 0 to 99.
|
---|
612 |
|
---|
613 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 5
|
---|
614 |
|
---|
615 | The second string matches '\underline{This_is-OK}'. We've used the
|
---|
616 | character set abbreviation '\\S' (non-whitespace) and the anchors
|
---|
617 | to match strings which contain no whitespace.
|
---|
618 |
|
---|
619 | In the following example we match strings containing 'mail' or
|
---|
620 | 'letter' or 'correspondence' but only match whole words i.e. not
|
---|
621 | 'email'
|
---|
622 |
|
---|
623 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 6
|
---|
624 |
|
---|
625 | The second string matches "Please write the \underline{letter}". The
|
---|
626 | word 'letter' is also captured (because of the parentheses). We
|
---|
627 | can see what text we've captured like this:
|
---|
628 |
|
---|
629 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 7
|
---|
630 |
|
---|
631 | This will capture the text from the first set of capturing
|
---|
632 | parentheses (counting capturing left parentheses from left to
|
---|
633 | right). The parentheses are counted from 1 since cap(0) is the
|
---|
634 | whole matched regexp (equivalent to '&' in most regexp engines).
|
---|
635 |
|
---|
636 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 8
|
---|
637 |
|
---|
638 | Here we've passed the QRegExp to QString's replace() function to
|
---|
639 | replace the matched text with new text.
|
---|
640 |
|
---|
641 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 9
|
---|
642 |
|
---|
643 | We've used the indexIn() function to repeatedly match the regexp in
|
---|
644 | the string. Note that instead of moving forward by one character
|
---|
645 | at a time \c pos++ we could have written \c {pos +=
|
---|
646 | rx.matchedLength()} to skip over the already matched string. The
|
---|
647 | count will equal 3, matching 'One \underline{Eric} another
|
---|
648 | \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it
|
---|
649 | doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
|
---|
650 | by word boundaries on both sides.
|
---|
651 |
|
---|
652 | One common use of regexps is to split lines of delimited data into
|
---|
653 | their component fields.
|
---|
654 |
|
---|
655 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 10
|
---|
656 |
|
---|
657 | In this example our input lines have the format company name, web
|
---|
658 | address and country. Unfortunately the regexp is rather long and
|
---|
659 | not very versatile -- the code will break if we add any more
|
---|
660 | fields. A simpler and better solution is to look for the
|
---|
661 | separator, '\\t' in this case, and take the surrounding text. The
|
---|
662 | QString::split() function can take a separator string or regexp
|
---|
663 | as an argument and split a string accordingly.
|
---|
664 |
|
---|
665 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 11
|
---|
666 |
|
---|
667 | Here field[0] is the company, field[1] the web address and so on.
|
---|
668 |
|
---|
669 | To imitate the matching of a shell we can use wildcard mode.
|
---|
670 |
|
---|
671 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 12
|
---|
672 |
|
---|
673 | Wildcard matching can be convenient because of its simplicity, but
|
---|
674 | any wildcard regexp can be defined using full regexps, e.g.
|
---|
675 | \bold{.*\\.html$}. Notice that we can't match both \c .html and \c
|
---|
676 | .htm files with a wildcard unless we use \bold{*.htm*} which will
|
---|
677 | also match 'test.html.bak'. A full regexp gives us the precision
|
---|
678 | we need, \bold{.*\\.html?$}.
|
---|
679 |
|
---|
680 | QRegExp can match case insensitively using setCaseSensitivity(),
|
---|
681 | and can use non-greedy matching, see setMinimal(). By
|
---|
682 | default QRegExp uses full regexps but this can be changed with
|
---|
683 | setPatternSyntax(). Searching can be forward with indexIn() or backward
|
---|
684 | with lastIndexIn(). Captured text can be accessed using
|
---|
685 | capturedTexts() which returns a string list of all captured
|
---|
686 | strings, or using cap() which returns the captured string for the
|
---|
687 | given index. The pos() function takes a match index and returns
|
---|
688 | the position in the string where the match was made (or -1 if
|
---|
689 | there was no match).
|
---|
690 |
|
---|
691 | \sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel,
|
---|
692 | {tools/regexp}{Regular Expression Example}
|
---|
693 | */
|
---|
694 |
|
---|
// Some VxWorks headers define EOS as a macro, which would clash with the
// constant of the same name declared below.
#ifdef Q_OS_VXWORKS
#  ifdef EOS
#    undef EOS
#  endif
#endif

// Number of buckets in the first-occurrence ("bad character") tables, and the
// mapping from a character to its bucket.
const int NumBadChars = 64;
#define BadChar(ch) ((ch).unicode() % NumBadChars)

// Sentinel values. Three of them share INT_MAX; they are kept apart by name
// so each use site states what it means.
const int NoOccurrence = INT_MAX;
const int EmptyCapture = INT_MAX;
const int InftyLen = INT_MAX;
const int InftyRep = 1025; // presumably "unbounded" repetition count - TODO confirm
const int EOS = -1;        // presumably the end-of-input marker - TODO confirm
707 |
|
---|
708 | static bool isWord(QChar ch)
|
---|
709 | {
|
---|
710 | return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_');
|
---|
711 | }
|
---|
712 |
|
---|
713 | /*
|
---|
714 | Merges two vectors of ints and puts the result into the first
|
---|
715 | one.
|
---|
716 | */
|
---|
717 | static void mergeInto(QVector<int> *a, const QVector<int> &b)
|
---|
718 | {
|
---|
719 | int asize = a->size();
|
---|
720 | int bsize = b.size();
|
---|
721 | if (asize == 0) {
|
---|
722 | *a = b;
|
---|
723 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
724 | } else if (bsize == 1 && a->at(asize - 1) < b.at(0)) {
|
---|
725 | a->resize(asize + 1);
|
---|
726 | (*a)[asize] = b.at(0);
|
---|
727 | #endif
|
---|
728 | } else if (bsize >= 1) {
|
---|
729 | int csize = asize + bsize;
|
---|
730 | QVector<int> c(csize);
|
---|
731 | int i = 0, j = 0, k = 0;
|
---|
732 | while (i < asize) {
|
---|
733 | if (j < bsize) {
|
---|
734 | if (a->at(i) == b.at(j)) {
|
---|
735 | ++i;
|
---|
736 | --csize;
|
---|
737 | } else if (a->at(i) < b.at(j)) {
|
---|
738 | c[k++] = a->at(i++);
|
---|
739 | } else {
|
---|
740 | c[k++] = b.at(j++);
|
---|
741 | }
|
---|
742 | } else {
|
---|
743 | memcpy(c.data() + k, a->constData() + i, (asize - i) * sizeof(int));
|
---|
744 | break;
|
---|
745 | }
|
---|
746 | }
|
---|
747 | c.resize(csize);
|
---|
748 | if (j < bsize)
|
---|
749 | memcpy(c.data() + k, b.constData() + j, (bsize - j) * sizeof(int));
|
---|
750 | *a = c;
|
---|
751 | }
|
---|
752 | }
|
---|
753 |
|
---|
754 | #ifndef QT_NO_REGEXP_WILDCARD
|
---|
755 | /*
|
---|
756 | Translates a wildcard pattern to an equivalent regular expression
|
---|
757 | pattern (e.g., *.cpp to .*\.cpp).
|
---|
758 |
|
---|
759 | If enableEscaping is true, it is possible to escape the wildcard
|
---|
760 | characters with \
|
---|
761 | */
|
---|
762 | static QString wc2rx(const QString &wc_str, const bool enableEscaping)
|
---|
763 | {
|
---|
764 | const int wclen = wc_str.length();
|
---|
765 | QString rx;
|
---|
766 | int i = 0;
|
---|
767 | bool isEscaping = false; // the previous character is '\'
|
---|
768 | const QChar *wc = wc_str.unicode();
|
---|
769 |
|
---|
770 | while (i < wclen) {
|
---|
771 | const QChar c = wc[i++];
|
---|
772 | switch (c.unicode()) {
|
---|
773 | case '\\':
|
---|
774 | if (enableEscaping) {
|
---|
775 | if (isEscaping) {
|
---|
776 | rx += QLatin1String("\\\\");
|
---|
777 | } // we insert the \\ later if necessary
|
---|
778 | if (i+1 == wclen) { // the end
|
---|
779 | rx += QLatin1String("\\\\");
|
---|
780 | }
|
---|
781 | } else {
|
---|
782 | rx += QLatin1String("\\\\");
|
---|
783 | }
|
---|
784 | isEscaping = true;
|
---|
785 | break;
|
---|
786 | case '*':
|
---|
787 | if (isEscaping) {
|
---|
788 | rx += QLatin1String("\\*");
|
---|
789 | isEscaping = false;
|
---|
790 | } else {
|
---|
791 | rx += QLatin1String(".*");
|
---|
792 | }
|
---|
793 | break;
|
---|
794 | case '?':
|
---|
795 | if (isEscaping) {
|
---|
796 | rx += QLatin1String("\\?");
|
---|
797 | isEscaping = false;
|
---|
798 | } else {
|
---|
799 | rx += QLatin1Char('.');
|
---|
800 | }
|
---|
801 |
|
---|
802 | break;
|
---|
803 | case '$':
|
---|
804 | case '(':
|
---|
805 | case ')':
|
---|
806 | case '+':
|
---|
807 | case '.':
|
---|
808 | case '^':
|
---|
809 | case '{':
|
---|
810 | case '|':
|
---|
811 | case '}':
|
---|
812 | if (isEscaping) {
|
---|
813 | isEscaping = false;
|
---|
814 | rx += QLatin1String("\\\\");
|
---|
815 | }
|
---|
816 | rx += QLatin1Char('\\');
|
---|
817 | rx += c;
|
---|
818 | break;
|
---|
819 | case '[':
|
---|
820 | if (isEscaping) {
|
---|
821 | isEscaping = false;
|
---|
822 | rx += QLatin1String("\\[");
|
---|
823 | } else {
|
---|
824 | rx += c;
|
---|
825 | if (wc[i] == QLatin1Char('^'))
|
---|
826 | rx += wc[i++];
|
---|
827 | if (i < wclen) {
|
---|
828 | if (rx[i] == QLatin1Char(']'))
|
---|
829 | rx += wc[i++];
|
---|
830 | while (i < wclen && wc[i] != QLatin1Char(']')) {
|
---|
831 | if (wc[i] == QLatin1Char('\\'))
|
---|
832 | rx += QLatin1Char('\\');
|
---|
833 | rx += wc[i++];
|
---|
834 | }
|
---|
835 | }
|
---|
836 | }
|
---|
837 | break;
|
---|
838 |
|
---|
839 | case ']':
|
---|
840 | if(isEscaping){
|
---|
841 | isEscaping = false;
|
---|
842 | rx += QLatin1String("\\");
|
---|
843 | }
|
---|
844 | rx += c;
|
---|
845 | break;
|
---|
846 |
|
---|
847 | default:
|
---|
848 | if(isEscaping){
|
---|
849 | isEscaping = false;
|
---|
850 | rx += QLatin1String("\\\\");
|
---|
851 | }
|
---|
852 | rx += c;
|
---|
853 | }
|
---|
854 | }
|
---|
855 | return rx;
|
---|
856 | }
|
---|
857 | #endif
|
---|
858 |
|
---|
859 | static int caretIndex(int offset, QRegExp::CaretMode caretMode)
|
---|
860 | {
|
---|
861 | if (caretMode == QRegExp::CaretAtZero) {
|
---|
862 | return 0;
|
---|
863 | } else if (caretMode == QRegExp::CaretAtOffset) {
|
---|
864 | return offset;
|
---|
865 | } else { // QRegExp::CaretWontMatch
|
---|
866 | return -1;
|
---|
867 | }
|
---|
868 | }
|
---|
869 |
|
---|
870 | /*
|
---|
871 | The QRegExpEngineKey struct uniquely identifies an engine.
|
---|
872 | */
|
---|
873 | struct QRegExpEngineKey
|
---|
874 | {
|
---|
875 | QString pattern;
|
---|
876 | QRegExp::PatternSyntax patternSyntax;
|
---|
877 | Qt::CaseSensitivity cs;
|
---|
878 |
|
---|
879 | inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,
|
---|
880 | Qt::CaseSensitivity cs)
|
---|
881 | : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {}
|
---|
882 |
|
---|
883 | inline void clear() {
|
---|
884 | pattern.clear();
|
---|
885 | patternSyntax = QRegExp::RegExp;
|
---|
886 | cs = Qt::CaseSensitive;
|
---|
887 | }
|
---|
888 | };
|
---|
889 |
|
---|
890 | Q_STATIC_GLOBAL_OPERATOR bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)
|
---|
891 | {
|
---|
892 | return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax
|
---|
893 | && key1.cs == key2.cs;
|
---|
894 | }
|
---|
895 |
|
---|
896 | class QRegExpEngine;
|
---|
897 |
|
---|
898 | //Q_DECLARE_TYPEINFO(QVector<int>, Q_MOVABLE_TYPE);
|
---|
899 |
|
---|
900 | /*
|
---|
901 | This is the engine state during matching.
|
---|
902 | */
|
---|
903 | struct QRegExpMatchState
|
---|
904 | {
|
---|
905 | const QChar *in; // a pointer to the input string data
|
---|
906 | int pos; // the current position in the string
|
---|
907 | int caretPos;
|
---|
908 | int len; // the length of the input string
|
---|
909 | bool minimal; // minimal matching?
|
---|
910 | int *bigArray; // big array holding the data for the next pointers
|
---|
911 | int *inNextStack; // is state is nextStack?
|
---|
912 | int *curStack; // stack of current states
|
---|
913 | int *nextStack; // stack of next states
|
---|
914 | int *curCapBegin; // start of current states' captures
|
---|
915 | int *nextCapBegin; // start of next states' captures
|
---|
916 | int *curCapEnd; // end of current states' captures
|
---|
917 | int *nextCapEnd; // end of next states' captures
|
---|
918 | int *tempCapBegin; // start of temporary captures
|
---|
919 | int *tempCapEnd; // end of temporary captures
|
---|
920 | int *capBegin; // start of captures for a next state
|
---|
921 | int *capEnd; // end of captures for a next state
|
---|
922 | int *slideTab; // bump-along slide table for bad-character heuristic
|
---|
923 | int *captured; // what match() returned last
|
---|
924 | int slideTabSize; // size of slide table
|
---|
925 | int capturedSize;
|
---|
926 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
927 | QList<QVector<int> > sleeping; // list of back-reference sleepers
|
---|
928 | #endif
|
---|
929 | int matchLen; // length of match
|
---|
930 | int oneTestMatchedLen; // length of partial match
|
---|
931 |
|
---|
932 | const QRegExpEngine *eng;
|
---|
933 |
|
---|
934 | inline QRegExpMatchState() : bigArray(0), captured(0) {}
|
---|
935 | inline ~QRegExpMatchState() { free(bigArray); }
|
---|
936 |
|
---|
937 | void drain() { free(bigArray); bigArray = 0; captured = 0; } // to save memory
|
---|
938 | void prepareForMatch(QRegExpEngine *eng);
|
---|
939 | void match(const QChar *str, int len, int pos, bool minimal,
|
---|
940 | bool oneTest, int caretIndex);
|
---|
941 | bool matchHere();
|
---|
942 | bool testAnchor(int i, int a, const int *capBegin);
|
---|
943 | };
|
---|
944 |
|
---|
945 | /*
|
---|
946 | The struct QRegExpAutomatonState represents one state in a modified NFA. The
|
---|
947 | input characters matched are stored in the state instead of on
|
---|
948 | the transitions, something possible for an automaton
|
---|
949 | constructed from a regular expression.
|
---|
950 | */
|
---|
951 | struct QRegExpAutomatonState
|
---|
952 | {
|
---|
953 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
954 | int atom; // which atom does this state belong to?
|
---|
955 | #endif
|
---|
956 | int match; // what does it match? (see CharClassBit and BackRefBit)
|
---|
957 | QVector<int> outs; // out-transitions
|
---|
958 | QMap<int, int> reenter; // atoms reentered when transiting out
|
---|
959 | QMap<int, int> anchors; // anchors met when transiting out
|
---|
960 |
|
---|
961 | inline QRegExpAutomatonState() { }
|
---|
962 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
963 | inline QRegExpAutomatonState(int a, int m)
|
---|
964 | : atom(a), match(m) { }
|
---|
965 | #else
|
---|
966 | inline QRegExpAutomatonState(int m)
|
---|
967 | : match(m) { }
|
---|
968 | #endif
|
---|
969 | };
|
---|
970 |
|
---|
971 | Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE);
|
---|
972 |
|
---|
973 | /*
|
---|
974 | The struct QRegExpCharClassRange represents a range of characters (e.g.,
|
---|
975 | [0-9] denotes range 48 to 57).
|
---|
976 | */
|
---|
977 | struct QRegExpCharClassRange
|
---|
978 | {
|
---|
979 | ushort from; // 48
|
---|
980 | ushort len; // 10
|
---|
981 | };
|
---|
982 |
|
---|
983 | Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
|
---|
984 |
|
---|
985 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
986 | /*
|
---|
987 | The struct QRegExpAtom represents one node in the hierarchy of regular
|
---|
988 | expression atoms.
|
---|
989 | */
|
---|
990 | struct QRegExpAtom
|
---|
991 | {
|
---|
992 | enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 };
|
---|
993 |
|
---|
994 | int parent; // index of parent in array of atoms
|
---|
995 | int capture; // index of capture, from 1 to ncap - 1
|
---|
996 | };
|
---|
997 |
|
---|
998 | Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
|
---|
999 | #endif
|
---|
1000 |
|
---|
1001 | struct QRegExpLookahead;
|
---|
1002 |
|
---|
1003 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
---|
1004 | /*
|
---|
1005 | The struct QRegExpAnchorAlternation represents a pair of anchors with
|
---|
1006 | OR semantics.
|
---|
1007 | */
|
---|
1008 | struct QRegExpAnchorAlternation
|
---|
1009 | {
|
---|
1010 | int a; // this anchor...
|
---|
1011 | int b; // ...or this one
|
---|
1012 | };
|
---|
1013 |
|
---|
1014 | Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
|
---|
1015 | #endif
|
---|
1016 |
|
---|
1017 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
1018 | /*
|
---|
1019 | The class QRegExpCharClass represents a set of characters, such as can
|
---|
1020 | be found in regular expressions (e.g., [a-z] denotes the set
|
---|
1021 | {a, b, ..., z}).
|
---|
1022 | */
|
---|
1023 | class QRegExpCharClass
|
---|
1024 | {
|
---|
1025 | public:
|
---|
1026 | QRegExpCharClass();
|
---|
1027 | inline QRegExpCharClass(const QRegExpCharClass &cc) { operator=(cc); }
|
---|
1028 |
|
---|
1029 | QRegExpCharClass &operator=(const QRegExpCharClass &cc);
|
---|
1030 |
|
---|
1031 | void clear();
|
---|
1032 | bool negative() const { return n; }
|
---|
1033 | void setNegative(bool negative);
|
---|
1034 | void addCategories(int cats);
|
---|
1035 | void addRange(ushort from, ushort to);
|
---|
1036 | void addSingleton(ushort ch) { addRange(ch, ch); }
|
---|
1037 |
|
---|
1038 | bool in(QChar ch) const;
|
---|
1039 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1040 | const QVector<int> &firstOccurrence() const { return occ1; }
|
---|
1041 | #endif
|
---|
1042 |
|
---|
1043 | #if defined(QT_DEBUG)
|
---|
1044 | void dump() const;
|
---|
1045 | #endif
|
---|
1046 |
|
---|
1047 | private:
|
---|
1048 | int c; // character classes
|
---|
1049 | QVector<QRegExpCharClassRange> r; // character ranges
|
---|
1050 | bool n; // negative?
|
---|
1051 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1052 | QVector<int> occ1; // first-occurrence array
|
---|
1053 | #endif
|
---|
1054 | };
|
---|
1055 | #else
|
---|
1056 | struct QRegExpCharClass
|
---|
1057 | {
|
---|
1058 | int dummy;
|
---|
1059 |
|
---|
1060 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1061 | QRegExpCharClass() { occ1.fill(0, NumBadChars); }
|
---|
1062 |
|
---|
1063 | const QVector<int> &firstOccurrence() const { return occ1; }
|
---|
1064 | QVector<int> occ1;
|
---|
1065 | #endif
|
---|
1066 | };
|
---|
1067 | #endif
|
---|
1068 |
|
---|
1069 | Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE);
|
---|
1070 |
|
---|
1071 | /*
|
---|
1072 | The QRegExpEngine class encapsulates a modified nondeterministic
|
---|
1073 | finite automaton (NFA).
|
---|
1074 | */
|
---|
1075 | class QRegExpEngine
|
---|
1076 | {
|
---|
1077 | public:
|
---|
1078 | QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)
|
---|
1079 | : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }
|
---|
1080 |
|
---|
1081 | QRegExpEngine(const QRegExpEngineKey &key);
|
---|
1082 | ~QRegExpEngine();
|
---|
1083 |
|
---|
1084 | bool isValid() const { return valid; }
|
---|
1085 | const QString &errorString() const { return yyError; }
|
---|
1086 | int captureCount() const { return officialncap; }
|
---|
1087 |
|
---|
1088 | int createState(QChar ch);
|
---|
1089 | int createState(const QRegExpCharClass &cc);
|
---|
1090 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
1091 | int createState(int bref);
|
---|
1092 | #endif
|
---|
1093 |
|
---|
1094 | void addCatTransitions(const QVector<int> &from, const QVector<int> &to);
|
---|
1095 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1096 | void addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom);
|
---|
1097 | #endif
|
---|
1098 |
|
---|
1099 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
---|
1100 | int anchorAlternation(int a, int b);
|
---|
1101 | int anchorConcatenation(int a, int b);
|
---|
1102 | #else
|
---|
1103 | int anchorAlternation(int a, int b) { return a & b; }
|
---|
1104 | int anchorConcatenation(int a, int b) { return a | b; }
|
---|
1105 | #endif
|
---|
1106 | void addAnchors(int from, int to, int a);
|
---|
1107 |
|
---|
1108 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1109 | void heuristicallyChooseHeuristic();
|
---|
1110 | #endif
|
---|
1111 |
|
---|
1112 | #if defined(QT_DEBUG)
|
---|
1113 | void dump() const;
|
---|
1114 | #endif
|
---|
1115 |
|
---|
1116 | QAtomicInt ref;
|
---|
1117 |
|
---|
1118 | private:
|
---|
1119 | enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
|
---|
1120 | enum { InitialState = 0, FinalState = 1 };
|
---|
1121 |
|
---|
1122 | void setup();
|
---|
1123 | int setupState(int match);
|
---|
1124 |
|
---|
1125 | /*
|
---|
1126 | Let's hope that 13 lookaheads and 14 back-references are
|
---|
1127 | enough.
|
---|
1128 | */
|
---|
1129 | enum { MaxLookaheads = 13, MaxBackRefs = 14 };
|
---|
1130 | enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,
|
---|
1131 | Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,
|
---|
1132 | Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
|
---|
1133 | Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
|
---|
1134 | Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,
|
---|
1135 |
|
---|
1136 | Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^
|
---|
1137 | ((Anchor_FirstLookahead << MaxLookaheads) - 1) };
|
---|
1138 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1139 | int startAtom(bool officialCapture);
|
---|
1140 | void finishAtom(int atom, bool needCapture);
|
---|
1141 | #endif
|
---|
1142 |
|
---|
1143 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
1144 | int addLookahead(QRegExpEngine *eng, bool negative);
|
---|
1145 | #endif
|
---|
1146 |
|
---|
1147 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1148 | bool goodStringMatch(QRegExpMatchState &matchState) const;
|
---|
1149 | bool badCharMatch(QRegExpMatchState &matchState) const;
|
---|
1150 | #else
|
---|
1151 | bool bruteMatch(QRegExpMatchState &matchState) const;
|
---|
1152 | #endif
|
---|
1153 |
|
---|
1154 | QVector<QRegExpAutomatonState> s; // array of states
|
---|
1155 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1156 | QVector<QRegExpAtom> f; // atom hierarchy
|
---|
1157 | int nf; // number of atoms
|
---|
1158 | int cf; // current atom
|
---|
1159 | QVector<int> captureForOfficialCapture;
|
---|
1160 | #endif
|
---|
1161 | int officialncap; // number of captures, seen from the outside
|
---|
1162 | int ncap; // number of captures, seen from the inside
|
---|
1163 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
1164 | QVector<QRegExpCharClass> cl; // array of character classes
|
---|
1165 | #endif
|
---|
1166 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
1167 | QVector<QRegExpLookahead *> ahead; // array of lookaheads
|
---|
1168 | #endif
|
---|
1169 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
---|
1170 | QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
|
---|
1171 | #endif
|
---|
1172 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1173 | bool caretAnchored; // does the regexp start with ^?
|
---|
1174 | bool trivial; // is the good-string all that needs to match?
|
---|
1175 | #endif
|
---|
1176 | bool valid; // is the regular expression valid?
|
---|
1177 | Qt::CaseSensitivity cs; // case sensitive?
|
---|
1178 | bool greedyQuantifiers; // RegExp2?
|
---|
1179 | bool xmlSchemaExtensions;
|
---|
1180 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
1181 | int nbrefs; // number of back-references
|
---|
1182 | #endif
|
---|
1183 |
|
---|
1184 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1185 | bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
|
---|
1186 |
|
---|
1187 | int goodEarlyStart; // the index where goodStr can first occur in a match
|
---|
1188 | int goodLateStart; // the index where goodStr can last occur in a match
|
---|
1189 | QString goodStr; // the string that any match has to contain
|
---|
1190 |
|
---|
1191 | int minl; // the minimum length of a match
|
---|
1192 | QVector<int> occ1; // first-occurrence array
|
---|
1193 | #endif
|
---|
1194 |
|
---|
1195 | /*
|
---|
1196 | The class Box is an abstraction for a regular expression
|
---|
1197 | fragment. It can also be seen as one node in the syntax tree of
|
---|
1198 | a regular expression with synthetized attributes.
|
---|
1199 |
|
---|
1200 | Its interface is ugly for performance reasons.
|
---|
1201 | */
|
---|
1202 | class Box
|
---|
1203 | {
|
---|
1204 | public:
|
---|
1205 | Box(QRegExpEngine *engine);
|
---|
1206 | Box(const Box &b) { operator=(b); }
|
---|
1207 |
|
---|
1208 | Box &operator=(const Box &b);
|
---|
1209 |
|
---|
1210 | void clear() { operator=(Box(eng)); }
|
---|
1211 | void set(QChar ch);
|
---|
1212 | void set(const QRegExpCharClass &cc);
|
---|
1213 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
1214 | void set(int bref);
|
---|
1215 | #endif
|
---|
1216 |
|
---|
1217 | void cat(const Box &b);
|
---|
1218 | void orx(const Box &b);
|
---|
1219 | void plus(int atom);
|
---|
1220 | void opt();
|
---|
1221 | void catAnchor(int a);
|
---|
1222 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1223 | void setupHeuristics();
|
---|
1224 | #endif
|
---|
1225 |
|
---|
1226 | #if defined(QT_DEBUG)
|
---|
1227 | void dump() const;
|
---|
1228 | #endif
|
---|
1229 |
|
---|
1230 | private:
|
---|
1231 | void addAnchorsToEngine(const Box &to) const;
|
---|
1232 |
|
---|
1233 | QRegExpEngine *eng; // the automaton under construction
|
---|
1234 | QVector<int> ls; // the left states (firstpos)
|
---|
1235 | QVector<int> rs; // the right states (lastpos)
|
---|
1236 | QMap<int, int> lanchors; // the left anchors
|
---|
1237 | QMap<int, int> ranchors; // the right anchors
|
---|
1238 | int skipanchors; // the anchors to match if the box is skipped
|
---|
1239 |
|
---|
1240 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1241 | int earlyStart; // the index where str can first occur
|
---|
1242 | int lateStart; // the index where str can last occur
|
---|
1243 | QString str; // a string that has to occur in any match
|
---|
1244 | QString leftStr; // a string occurring at the left of this box
|
---|
1245 | QString rightStr; // a string occurring at the right of this box
|
---|
1246 | int maxl; // the maximum length of this box (possibly InftyLen)
|
---|
1247 | #endif
|
---|
1248 |
|
---|
1249 | int minl; // the minimum length of this box
|
---|
1250 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1251 | QVector<int> occ1; // first-occurrence array
|
---|
1252 | #endif
|
---|
1253 | };
|
---|
1254 |
|
---|
1255 | friend class Box;
|
---|
1256 |
|
---|
1257 | void setupCategoriesRangeMap();
|
---|
1258 |
|
---|
1259 | /*
|
---|
1260 | This is the lexical analyzer for regular expressions.
|
---|
1261 | */
|
---|
1262 | enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
|
---|
1263 | Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
|
---|
1264 | Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
|
---|
1265 | int getChar();
|
---|
1266 | int getEscape();
|
---|
1267 | #ifndef QT_NO_REGEXP_INTERVAL
|
---|
1268 | int getRep(int def);
|
---|
1269 | #endif
|
---|
1270 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
1271 | void skipChars(int n);
|
---|
1272 | #endif
|
---|
1273 | void error(const char *msg);
|
---|
1274 | void startTokenizer(const QChar *rx, int len);
|
---|
1275 | int getToken();
|
---|
1276 |
|
---|
1277 | const QChar *yyIn; // a pointer to the input regular expression pattern
|
---|
1278 | int yyPos0; // the position of yyTok in the input pattern
|
---|
1279 | int yyPos; // the position of the next character to read
|
---|
1280 | int yyLen; // the length of yyIn
|
---|
1281 | int yyCh; // the last character read
|
---|
1282 | QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens
|
---|
1283 | int yyMinRep; // attribute for Tok_Quantifier
|
---|
1284 | int yyMaxRep; // ditto
|
---|
1285 | QString yyError; // syntax error or overflow during parsing?
|
---|
1286 |
|
---|
1287 | /*
|
---|
1288 | This is the syntactic analyzer for regular expressions.
|
---|
1289 | */
|
---|
1290 | int parse(const QChar *rx, int len);
|
---|
1291 | void parseAtom(Box *box);
|
---|
1292 | void parseFactor(Box *box);
|
---|
1293 | void parseTerm(Box *box);
|
---|
1294 | void parseExpression(Box *box);
|
---|
1295 |
|
---|
1296 | int yyTok; // the last token read
|
---|
1297 | bool yyMayCapture; // set this to false to disable capturing
|
---|
1298 | QHash<QByteArray, QPair<int, int> > categoriesRangeMap; // fast lookup hash for xml schema extensions
|
---|
1299 |
|
---|
1300 | friend struct QRegExpMatchState;
|
---|
1301 | };
|
---|
1302 |
|
---|
1303 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
1304 | /*
|
---|
1305 | The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
|
---|
1306 | (?=foo) and (?!bar)).
|
---|
1307 | */
|
---|
1308 | struct QRegExpLookahead
|
---|
1309 | {
|
---|
1310 | QRegExpEngine *eng; // NFA representing the embedded regular expression
|
---|
1311 | bool neg; // negative lookahead?
|
---|
1312 |
|
---|
1313 | inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)
|
---|
1314 | : eng(eng0), neg(neg0) { }
|
---|
1315 | inline ~QRegExpLookahead() { delete eng; }
|
---|
1316 | };
|
---|
1317 | #endif
|
---|
1318 |
|
---|
1319 | /*! \internal
|
---|
1320 | convert the pattern string to the RegExp syntax.
|
---|
1321 |
|
---|
1322 | This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan
|
---|
1323 | */
|
---|
1324 | Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax)
|
---|
1325 | {
|
---|
1326 | switch (patternSyntax) {
|
---|
1327 | #ifndef QT_NO_REGEXP_WILDCARD
|
---|
1328 | case QRegExp::Wildcard:
|
---|
1329 | return wc2rx(pattern, false);
|
---|
1330 | break;
|
---|
1331 | case QRegExp::WildcardUnix:
|
---|
1332 | return wc2rx(pattern, true);
|
---|
1333 | break;
|
---|
1334 | #endif
|
---|
1335 | case QRegExp::FixedString:
|
---|
1336 | return QRegExp::escape(pattern);
|
---|
1337 | break;
|
---|
1338 | case QRegExp::W3CXmlSchema11:
|
---|
1339 | default:
|
---|
1340 | return pattern;
|
---|
1341 | }
|
---|
1342 | }
|
---|
1343 |
|
---|
1344 | QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
|
---|
1345 | : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),
|
---|
1346 | xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)
|
---|
1347 | {
|
---|
1348 | setup();
|
---|
1349 |
|
---|
1350 | QString rx = qt_regexp_toCanonical(key.pattern, key.patternSyntax);
|
---|
1351 |
|
---|
1352 | valid = (parse(rx.unicode(), rx.length()) == rx.length());
|
---|
1353 | if (!valid) {
|
---|
1354 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1355 | trivial = false;
|
---|
1356 | #endif
|
---|
1357 | error(RXERR_LEFTDELIM);
|
---|
1358 | }
|
---|
1359 | }
|
---|
1360 |
|
---|
1361 | QRegExpEngine::~QRegExpEngine()
|
---|
1362 | {
|
---|
1363 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
1364 | qDeleteAll(ahead);
|
---|
1365 | #endif
|
---|
1366 | }
|
---|
1367 |
|
---|
1368 | void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
|
---|
1369 | {
|
---|
1370 | /*
|
---|
1371 | We use one QVector<int> for all the big data used a lot in
|
---|
1372 | matchHere() and friends.
|
---|
1373 | */
|
---|
1374 | int ns = eng->s.size(); // number of states
|
---|
1375 | int ncap = eng->ncap;
|
---|
1376 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1377 | int newSlideTabSize = qMax(eng->minl + 1, 16);
|
---|
1378 | #else
|
---|
1379 | int newSlideTabSize = 0;
|
---|
1380 | #endif
|
---|
1381 | int numCaptures = eng->captureCount();
|
---|
1382 | int newCapturedSize = 2 + 2 * numCaptures;
|
---|
1383 | bigArray = q_check_ptr((int *)realloc(bigArray, ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));
|
---|
1384 |
|
---|
1385 | // set all internal variables only _after_ bigArray is realloc'ed
|
---|
1386 | // to prevent a broken regexp in oom case
|
---|
1387 |
|
---|
1388 | slideTabSize = newSlideTabSize;
|
---|
1389 | capturedSize = newCapturedSize;
|
---|
1390 | inNextStack = bigArray;
|
---|
1391 | memset(inNextStack, -1, ns * sizeof(int));
|
---|
1392 | curStack = inNextStack + ns;
|
---|
1393 | nextStack = inNextStack + 2 * ns;
|
---|
1394 |
|
---|
1395 | curCapBegin = inNextStack + 3 * ns;
|
---|
1396 | nextCapBegin = curCapBegin + ncap * ns;
|
---|
1397 | curCapEnd = curCapBegin + 2 * ncap * ns;
|
---|
1398 | nextCapEnd = curCapBegin + 3 * ncap * ns;
|
---|
1399 |
|
---|
1400 | tempCapBegin = curCapBegin + 4 * ncap * ns;
|
---|
1401 | tempCapEnd = tempCapBegin + ncap;
|
---|
1402 | capBegin = tempCapBegin + 2 * ncap;
|
---|
1403 | capEnd = tempCapBegin + 3 * ncap;
|
---|
1404 |
|
---|
1405 | slideTab = tempCapBegin + 4 * ncap;
|
---|
1406 | captured = slideTab + slideTabSize;
|
---|
1407 | memset(captured, -1, capturedSize*sizeof(int));
|
---|
1408 | this->eng = eng;
|
---|
1409 | }
|
---|
1410 |
|
---|
1411 | /*
|
---|
1412 | Tries to match in str and returns an array of (begin, length) pairs
|
---|
1413 | for captured text. If there is no match, all pairs are (-1, -1).
|
---|
1414 | */
|
---|
1415 | void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,
|
---|
1416 | bool minimal0, bool oneTest, int caretIndex)
|
---|
1417 | {
|
---|
1418 | bool matched = false;
|
---|
1419 | QChar char_null;
|
---|
1420 |
|
---|
1421 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1422 | if (eng->trivial && !oneTest) {
|
---|
1423 | pos = qFindString(str0, len0, pos0, eng->goodStr.unicode(), eng->goodStr.length(), eng->cs);
|
---|
1424 | matchLen = eng->goodStr.length();
|
---|
1425 | matched = (pos != -1);
|
---|
1426 | } else
|
---|
1427 | #endif
|
---|
1428 | {
|
---|
1429 | in = str0;
|
---|
1430 | if (in == 0)
|
---|
1431 | in = &char_null;
|
---|
1432 | pos = pos0;
|
---|
1433 | caretPos = caretIndex;
|
---|
1434 | len = len0;
|
---|
1435 | minimal = minimal0;
|
---|
1436 | matchLen = 0;
|
---|
1437 | oneTestMatchedLen = 0;
|
---|
1438 |
|
---|
1439 | if (eng->valid && pos >= 0 && pos <= len) {
|
---|
1440 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1441 | if (oneTest) {
|
---|
1442 | matched = matchHere();
|
---|
1443 | } else {
|
---|
1444 | if (pos <= len - eng->minl) {
|
---|
1445 | if (eng->caretAnchored) {
|
---|
1446 | matched = matchHere();
|
---|
1447 | } else if (eng->useGoodStringHeuristic) {
|
---|
1448 | matched = eng->goodStringMatch(*this);
|
---|
1449 | } else {
|
---|
1450 | matched = eng->badCharMatch(*this);
|
---|
1451 | }
|
---|
1452 | }
|
---|
1453 | }
|
---|
1454 | #else
|
---|
1455 | matched = oneTest ? matchHere() : eng->bruteMatch(*this);
|
---|
1456 | #endif
|
---|
1457 | }
|
---|
1458 | }
|
---|
1459 |
|
---|
1460 | if (matched) {
|
---|
1461 | int *c = captured;
|
---|
1462 | *c++ = pos;
|
---|
1463 | *c++ = matchLen;
|
---|
1464 |
|
---|
1465 | int numCaptures = (capturedSize - 2) >> 1;
|
---|
1466 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1467 | for (int i = 0; i < numCaptures; ++i) {
|
---|
1468 | int j = eng->captureForOfficialCapture.at(i);
|
---|
1469 | int len = capEnd[j] - capBegin[j];
|
---|
1470 | *c++ = (len > 0) ? pos + capBegin[j] : 0;
|
---|
1471 | *c++ = len;
|
---|
1472 | }
|
---|
1473 | #endif
|
---|
1474 | } else {
|
---|
1475 | // we rely on 2's complement here
|
---|
1476 | memset(captured, -1, capturedSize * sizeof(int));
|
---|
1477 | }
|
---|
1478 | }
|
---|
1479 |
|
---|
1480 | /*
|
---|
1481 | The three following functions add one state to the automaton and
|
---|
1482 | return the number of the state.
|
---|
1483 | */
|
---|
1484 |
|
---|
1485 | int QRegExpEngine::createState(QChar ch)
|
---|
1486 | {
|
---|
1487 | return setupState(ch.unicode());
|
---|
1488 | }
|
---|
1489 |
|
---|
1490 | int QRegExpEngine::createState(const QRegExpCharClass &cc)
|
---|
1491 | {
|
---|
1492 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
1493 | int n = cl.size();
|
---|
1494 | cl += QRegExpCharClass(cc);
|
---|
1495 | return setupState(CharClassBit | n);
|
---|
1496 | #else
|
---|
1497 | Q_UNUSED(cc);
|
---|
1498 | return setupState(CharClassBit);
|
---|
1499 | #endif
|
---|
1500 | }
|
---|
1501 |
|
---|
1502 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
1503 | int QRegExpEngine::createState(int bref)
|
---|
1504 | {
|
---|
1505 | if (bref > nbrefs) {
|
---|
1506 | nbrefs = bref;
|
---|
1507 | if (nbrefs > MaxBackRefs) {
|
---|
1508 | error(RXERR_LIMIT);
|
---|
1509 | return 0;
|
---|
1510 | }
|
---|
1511 | }
|
---|
1512 | return setupState(BackRefBit | bref);
|
---|
1513 | }
|
---|
1514 | #endif
|
---|
1515 |
|
---|
1516 | /*
|
---|
1517 | The two following functions add a transition between all pairs of
|
---|
1518 | states (i, j) where i is found in from, and j is found in to.
|
---|
1519 |
|
---|
1520 | Cat-transitions are distinguished from plus-transitions for
|
---|
1521 | capturing.
|
---|
1522 | */
|
---|
1523 |
|
---|
1524 | void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to)
|
---|
1525 | {
|
---|
1526 | for (int i = 0; i < from.size(); i++)
|
---|
1527 | mergeInto(&s[from.at(i)].outs, to);
|
---|
1528 | }
|
---|
1529 |
|
---|
1530 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1531 | void QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom)
|
---|
1532 | {
|
---|
1533 | for (int i = 0; i < from.size(); i++) {
|
---|
1534 | QRegExpAutomatonState &st = s[from.at(i)];
|
---|
1535 | const QVector<int> oldOuts = st.outs;
|
---|
1536 | mergeInto(&st.outs, to);
|
---|
1537 | if (f.at(atom).capture != QRegExpAtom::NoCapture) {
|
---|
1538 | for (int j = 0; j < to.size(); j++) {
|
---|
1539 | // ### st.reenter.contains(to.at(j)) check looks suspicious
|
---|
1540 | if (!st.reenter.contains(to.at(j)) &&
|
---|
1541 | qBinaryFind(oldOuts.constBegin(), oldOuts.constEnd(), to.at(j)) == oldOuts.end())
|
---|
1542 | st.reenter.insert(to.at(j), atom);
|
---|
1543 | }
|
---|
1544 | }
|
---|
1545 | }
|
---|
1546 | }
|
---|
1547 | #endif
|
---|
1548 |
|
---|
1549 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
---|
1550 | /*
|
---|
1551 | Returns an anchor that means a OR b.
|
---|
1552 | */
|
---|
1553 | int QRegExpEngine::anchorAlternation(int a, int b)
|
---|
1554 | {
|
---|
1555 | if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0)
|
---|
1556 | return a & b;
|
---|
1557 |
|
---|
1558 | int n = aa.size();
|
---|
1559 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1560 | if (n > 0 && aa.at(n - 1).a == a && aa.at(n - 1).b == b)
|
---|
1561 | return Anchor_Alternation | (n - 1);
|
---|
1562 | #endif
|
---|
1563 |
|
---|
1564 | QRegExpAnchorAlternation element = {a, b};
|
---|
1565 | aa.append(element);
|
---|
1566 | return Anchor_Alternation | n;
|
---|
1567 | }
|
---|
1568 |
|
---|
1569 | /*
|
---|
1570 | Returns an anchor that means a AND b.
|
---|
1571 | */
|
---|
1572 | int QRegExpEngine::anchorConcatenation(int a, int b)
|
---|
1573 | {
|
---|
1574 | if (((a | b) & Anchor_Alternation) == 0)
|
---|
1575 | return a | b;
|
---|
1576 | if ((b & Anchor_Alternation) != 0)
|
---|
1577 | qSwap(a, b);
|
---|
1578 |
|
---|
1579 | int aprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).a, b);
|
---|
1580 | int bprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).b, b);
|
---|
1581 | return anchorAlternation(aprime, bprime);
|
---|
1582 | }
|
---|
1583 | #endif
|
---|
1584 |
|
---|
1585 | /*
|
---|
1586 | Adds anchor a on a transition caracterised by its from state and
|
---|
1587 | its to state.
|
---|
1588 | */
|
---|
1589 | void QRegExpEngine::addAnchors(int from, int to, int a)
|
---|
1590 | {
|
---|
1591 | QRegExpAutomatonState &st = s[from];
|
---|
1592 | if (st.anchors.contains(to))
|
---|
1593 | a = anchorAlternation(st.anchors.value(to), a);
|
---|
1594 | st.anchors.insert(to, a);
|
---|
1595 | }
|
---|
1596 |
|
---|
1597 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1598 | /*
|
---|
1599 | This function chooses between the good-string and the bad-character
|
---|
1600 | heuristics. It computes two scores and chooses the heuristic with
|
---|
1601 | the highest score.
|
---|
1602 |
|
---|
1603 | Here are some common-sense constraints on the scores that should be
|
---|
1604 | respected if the formulas are ever modified: (1) If goodStr is
|
---|
1605 | empty, the good-string heuristic scores 0. (2) If the regular
|
---|
1606 | expression is trivial, the good-string heuristic should be used.
|
---|
1607 | (3) If the search is case insensitive, the good-string heuristic
|
---|
1608 | should be used, unless it scores 0. (Case insensitivity turns all
|
---|
1609 | entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
|
---|
1610 | big, the good-string heuristic should score less.
|
---|
1611 | */
|
---|
1612 | void QRegExpEngine::heuristicallyChooseHeuristic()
|
---|
1613 | {
|
---|
1614 | if (minl == 0) {
|
---|
1615 | useGoodStringHeuristic = false;
|
---|
1616 | } else if (trivial) {
|
---|
1617 | useGoodStringHeuristic = true;
|
---|
1618 | } else {
|
---|
1619 | /*
|
---|
1620 | Magic formula: The good string has to constitute a good
|
---|
1621 | proportion of the minimum-length string, and appear at a
|
---|
1622 | more-or-less known index.
|
---|
1623 | */
|
---|
1624 | int goodStringScore = (64 * goodStr.length() / minl) -
|
---|
1625 | (goodLateStart - goodEarlyStart);
|
---|
1626 | /*
|
---|
1627 | Less magic formula: We pick some characters at random, and
|
---|
1628 | check whether they are good or bad.
|
---|
1629 | */
|
---|
1630 | int badCharScore = 0;
|
---|
1631 | int step = qMax(1, NumBadChars / 32);
|
---|
1632 | for (int i = 1; i < NumBadChars; i += step) {
|
---|
1633 | if (occ1.at(i) == NoOccurrence)
|
---|
1634 | badCharScore += minl;
|
---|
1635 | else
|
---|
1636 | badCharScore += occ1.at(i);
|
---|
1637 | }
|
---|
1638 | badCharScore /= minl;
|
---|
1639 | useGoodStringHeuristic = (goodStringScore > badCharScore);
|
---|
1640 | }
|
---|
1641 | }
|
---|
1642 | #endif
|
---|
1643 |
|
---|
1644 | #if defined(QT_DEBUG)
|
---|
1645 | void QRegExpEngine::dump() const
|
---|
1646 | {
|
---|
1647 | int i, j;
|
---|
1648 | qDebug("Case %ssensitive engine", cs ? "" : "in");
|
---|
1649 | qDebug(" States");
|
---|
1650 | for (i = 0; i < s.size(); i++) {
|
---|
1651 | qDebug(" %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
|
---|
1652 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1653 | if (nf > 0)
|
---|
1654 | qDebug(" in atom %d", s[i].atom);
|
---|
1655 | #endif
|
---|
1656 | int m = s[i].match;
|
---|
1657 | if ((m & CharClassBit) != 0) {
|
---|
1658 | qDebug(" match character class %d", m ^ CharClassBit);
|
---|
1659 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
1660 | cl[m ^ CharClassBit].dump();
|
---|
1661 | #else
|
---|
1662 | qDebug(" negative character class");
|
---|
1663 | #endif
|
---|
1664 | } else if ((m & BackRefBit) != 0) {
|
---|
1665 | qDebug(" match back-reference %d", m ^ BackRefBit);
|
---|
1666 | } else if (m >= 0x20 && m <= 0x7e) {
|
---|
1667 | qDebug(" match 0x%.4x (%c)", m, m);
|
---|
1668 | } else {
|
---|
1669 | qDebug(" match 0x%.4x", m);
|
---|
1670 | }
|
---|
1671 | for (j = 0; j < s[i].outs.size(); j++) {
|
---|
1672 | int next = s[i].outs[j];
|
---|
1673 | qDebug(" -> %d", next);
|
---|
1674 | if (s[i].reenter.contains(next))
|
---|
1675 | qDebug(" [reenter %d]", s[i].reenter[next]);
|
---|
1676 | if (s[i].anchors.value(next) != 0)
|
---|
1677 | qDebug(" [anchors 0x%.8x]", s[i].anchors[next]);
|
---|
1678 | }
|
---|
1679 | }
|
---|
1680 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1681 | if (nf > 0) {
|
---|
1682 | qDebug(" Atom Parent Capture");
|
---|
1683 | for (i = 0; i < nf; i++) {
|
---|
1684 | if (f[i].capture == QRegExpAtom::NoCapture) {
|
---|
1685 | qDebug(" %6d %6d nil", i, f[i].parent);
|
---|
1686 | } else {
|
---|
1687 | int cap = f[i].capture;
|
---|
1688 | bool official = captureForOfficialCapture.contains(cap);
|
---|
1689 | qDebug(" %6d %6d %6d %s", i, f[i].parent, f[i].capture,
|
---|
1690 | official ? "official" : "");
|
---|
1691 | }
|
---|
1692 | }
|
---|
1693 | }
|
---|
1694 | #endif
|
---|
1695 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
---|
1696 | for (i = 0; i < aa.size(); i++)
|
---|
1697 | qDebug(" Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b);
|
---|
1698 | #endif
|
---|
1699 | }
|
---|
1700 | #endif
|
---|
1701 |
|
---|
1702 | void QRegExpEngine::setup()
|
---|
1703 | {
|
---|
1704 | ref = 1;
|
---|
1705 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1706 | f.resize(32);
|
---|
1707 | nf = 0;
|
---|
1708 | cf = -1;
|
---|
1709 | #endif
|
---|
1710 | officialncap = 0;
|
---|
1711 | ncap = 0;
|
---|
1712 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1713 | caretAnchored = true;
|
---|
1714 | trivial = true;
|
---|
1715 | #endif
|
---|
1716 | valid = false;
|
---|
1717 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
1718 | nbrefs = 0;
|
---|
1719 | #endif
|
---|
1720 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1721 | useGoodStringHeuristic = true;
|
---|
1722 | minl = 0;
|
---|
1723 | occ1.fill(0, NumBadChars);
|
---|
1724 | #endif
|
---|
1725 | }
|
---|
1726 |
|
---|
1727 | int QRegExpEngine::setupState(int match)
|
---|
1728 | {
|
---|
1729 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1730 | s += QRegExpAutomatonState(cf, match);
|
---|
1731 | #else
|
---|
1732 | s += QRegExpAutomatonState(match);
|
---|
1733 | #endif
|
---|
1734 | return s.size() - 1;
|
---|
1735 | }
|
---|
1736 |
|
---|
1737 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1738 | /*
|
---|
1739 | Functions startAtom() and finishAtom() should be called to delimit
|
---|
1740 | atoms. When a state is created, it is assigned to the current atom.
|
---|
1741 | The information is later used for capturing.
|
---|
1742 | */
|
---|
1743 | int QRegExpEngine::startAtom(bool officialCapture)
|
---|
1744 | {
|
---|
1745 | if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size())
|
---|
1746 | f.resize((nf + 1) << 1);
|
---|
1747 | f[nf].parent = cf;
|
---|
1748 | cf = nf++;
|
---|
1749 | f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;
|
---|
1750 | return cf;
|
---|
1751 | }
|
---|
1752 |
|
---|
1753 | void QRegExpEngine::finishAtom(int atom, bool needCapture)
|
---|
1754 | {
|
---|
1755 | if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture)
|
---|
1756 | f[atom].capture = QRegExpAtom::UnofficialCapture;
|
---|
1757 | cf = f.at(atom).parent;
|
---|
1758 | }
|
---|
1759 | #endif
|
---|
1760 |
|
---|
1761 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
1762 | /*
|
---|
1763 | Creates a lookahead anchor.
|
---|
1764 | */
|
---|
1765 | int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative)
|
---|
1766 | {
|
---|
1767 | int n = ahead.size();
|
---|
1768 | if (n == MaxLookaheads) {
|
---|
1769 | error(RXERR_LIMIT);
|
---|
1770 | return 0;
|
---|
1771 | }
|
---|
1772 | ahead += new QRegExpLookahead(eng, negative);
|
---|
1773 | return Anchor_FirstLookahead << n;
|
---|
1774 | }
|
---|
1775 | #endif
|
---|
1776 |
|
---|
1777 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
/*
  We want the longest leftmost captures.

  Returns true if the capture set (begin1, end1) beats (begin2, end2).
  The ncap zones are compared in order and the first zone that differs
  decides: an earlier begin wins, and for equal begins a later end
  wins. Identical sets yield false.
*/
static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
                            const int *end2)
{
    for (int zone = 0; zone < ncap; ++zone) {
        // it has to start early...
        const int startGain = begin2[zone] - begin1[zone];
        if (startGain != 0)
            return startGain > 0;

        // ...and end late
        const int endGain = end1[zone] - end2[zone];
        if (endGain != 0)
            return endGain > 0;
    }
    return false;
}
|
---|
1794 | #endif
|
---|
1795 |
|
---|
1796 | /*
|
---|
1797 | Returns true if anchor a matches at position pos + i in the input
|
---|
1798 | string, otherwise false.
|
---|
1799 | */
|
---|
1800 | bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)
|
---|
1801 | {
|
---|
1802 | int j;
|
---|
1803 |
|
---|
1804 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
---|
1805 | if ((a & QRegExpEngine::Anchor_Alternation) != 0)
|
---|
1806 | return testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)
|
---|
1807 | || testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);
|
---|
1808 | #endif
|
---|
1809 |
|
---|
1810 | if ((a & QRegExpEngine::Anchor_Caret) != 0) {
|
---|
1811 | if (pos + i != caretPos)
|
---|
1812 | return false;
|
---|
1813 | }
|
---|
1814 | if ((a & QRegExpEngine::Anchor_Dollar) != 0) {
|
---|
1815 | if (pos + i != len)
|
---|
1816 | return false;
|
---|
1817 | }
|
---|
1818 | #ifndef QT_NO_REGEXP_ESCAPE
|
---|
1819 | if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) {
|
---|
1820 | bool before = false;
|
---|
1821 | bool after = false;
|
---|
1822 | if (pos + i != 0)
|
---|
1823 | before = isWord(in[pos + i - 1]);
|
---|
1824 | if (pos + i != len)
|
---|
1825 | after = isWord(in[pos + i]);
|
---|
1826 | if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after))
|
---|
1827 | return false;
|
---|
1828 | if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after))
|
---|
1829 | return false;
|
---|
1830 | }
|
---|
1831 | #endif
|
---|
1832 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
1833 | if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) {
|
---|
1834 | const QVector<QRegExpLookahead *> &ahead = eng->ahead;
|
---|
1835 | for (j = 0; j < ahead.size(); j++) {
|
---|
1836 | if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) {
|
---|
1837 | QRegExpMatchState matchState;
|
---|
1838 | matchState.prepareForMatch(ahead[j]->eng);
|
---|
1839 | matchState.match(in + pos + i, len - pos - i, 0,
|
---|
1840 | true, true, matchState.caretPos - matchState.pos - i);
|
---|
1841 | if ((matchState.captured[0] == 0) == ahead[j]->neg)
|
---|
1842 | return false;
|
---|
1843 | }
|
---|
1844 | }
|
---|
1845 | }
|
---|
1846 | #endif
|
---|
1847 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1848 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
1849 | for (j = 0; j < eng->nbrefs; j++) {
|
---|
1850 | if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) {
|
---|
1851 | int i = eng->captureForOfficialCapture.at(j);
|
---|
1852 | if (capBegin[i] != EmptyCapture)
|
---|
1853 | return false;
|
---|
1854 | }
|
---|
1855 | }
|
---|
1856 | #endif
|
---|
1857 | #endif
|
---|
1858 | return true;
|
---|
1859 | }
|
---|
1860 |
|
---|
1861 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
1862 | /*
|
---|
1863 | The three following functions are what Jeffrey Friedl would call
|
---|
1864 | transmissions (or bump-alongs). Using one or the other should make
|
---|
1865 | no difference except in performance.
|
---|
1866 | */
|
---|
1867 |
|
---|
1868 | bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
|
---|
1869 | {
|
---|
1870 | int k = matchState.pos + goodEarlyStart;
|
---|
1871 | QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs);
|
---|
1872 | while ((k = matcher.indexIn(matchState.in, matchState.len, k)) != -1) {
|
---|
1873 | int from = k - goodLateStart;
|
---|
1874 | int to = k - goodEarlyStart;
|
---|
1875 | if (from > matchState.pos)
|
---|
1876 | matchState.pos = from;
|
---|
1877 |
|
---|
1878 | while (matchState.pos <= to) {
|
---|
1879 | if (matchState.matchHere())
|
---|
1880 | return true;
|
---|
1881 | ++matchState.pos;
|
---|
1882 | }
|
---|
1883 | ++k;
|
---|
1884 | }
|
---|
1885 | return false;
|
---|
1886 | }
|
---|
1887 |
|
---|
1888 | bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
|
---|
1889 | {
|
---|
1890 | int slideHead = 0;
|
---|
1891 | int slideNext = 0;
|
---|
1892 | int i;
|
---|
1893 | int lastPos = matchState.len - minl;
|
---|
1894 | memset(matchState.slideTab, 0, matchState.slideTabSize * sizeof(int));
|
---|
1895 |
|
---|
1896 | /*
|
---|
1897 | Set up the slide table, used for the bad-character heuristic,
|
---|
1898 | using the table of first occurrence of each character.
|
---|
1899 | */
|
---|
1900 | for (i = 0; i < minl; i++) {
|
---|
1901 | int sk = occ1[BadChar(matchState.in[matchState.pos + i])];
|
---|
1902 | if (sk == NoOccurrence)
|
---|
1903 | sk = i + 1;
|
---|
1904 | if (sk > 0) {
|
---|
1905 | int k = i + 1 - sk;
|
---|
1906 | if (k < 0) {
|
---|
1907 | sk = i + 1;
|
---|
1908 | k = 0;
|
---|
1909 | }
|
---|
1910 | if (sk > matchState.slideTab[k])
|
---|
1911 | matchState.slideTab[k] = sk;
|
---|
1912 | }
|
---|
1913 | }
|
---|
1914 |
|
---|
1915 | if (matchState.pos > lastPos)
|
---|
1916 | return false;
|
---|
1917 |
|
---|
1918 | for (;;) {
|
---|
1919 | if (++slideNext >= matchState.slideTabSize)
|
---|
1920 | slideNext = 0;
|
---|
1921 | if (matchState.slideTab[slideHead] > 0) {
|
---|
1922 | if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext])
|
---|
1923 | matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1;
|
---|
1924 | matchState.slideTab[slideHead] = 0;
|
---|
1925 | } else {
|
---|
1926 | if (matchState.matchHere())
|
---|
1927 | return true;
|
---|
1928 | }
|
---|
1929 |
|
---|
1930 | if (matchState.pos == lastPos)
|
---|
1931 | break;
|
---|
1932 |
|
---|
1933 | /*
|
---|
1934 | Update the slide table. This code has much in common with
|
---|
1935 | the initialization code.
|
---|
1936 | */
|
---|
1937 | int sk = occ1[BadChar(matchState.in[matchState.pos + minl])];
|
---|
1938 | if (sk == NoOccurrence) {
|
---|
1939 | matchState.slideTab[slideNext] = minl;
|
---|
1940 | } else if (sk > 0) {
|
---|
1941 | int k = slideNext + minl - sk;
|
---|
1942 | if (k >= matchState.slideTabSize)
|
---|
1943 | k -= matchState.slideTabSize;
|
---|
1944 | if (sk > matchState.slideTab[k])
|
---|
1945 | matchState.slideTab[k] = sk;
|
---|
1946 | }
|
---|
1947 | slideHead = slideNext;
|
---|
1948 | ++matchState.pos;
|
---|
1949 | }
|
---|
1950 | return false;
|
---|
1951 | }
|
---|
1952 | #else
|
---|
1953 | bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
|
---|
1954 | {
|
---|
1955 | while (matchState.pos <= matchState.len) {
|
---|
1956 | if (matchState.matchHere())
|
---|
1957 | return true;
|
---|
1958 | ++matchState.pos;
|
---|
1959 | }
|
---|
1960 | return false;
|
---|
1961 | }
|
---|
1962 | #endif
|
---|
1963 |
|
---|
1964 | /*
|
---|
1965 | Here's the core of the engine. It tries to do a match here and now.
|
---|
1966 | */
|
---|
1967 | bool QRegExpMatchState::matchHere()
|
---|
1968 | {
|
---|
1969 | int ncur = 1, nnext = 0;
|
---|
1970 | int i = 0, j, k, m;
|
---|
1971 | bool stop = false;
|
---|
1972 |
|
---|
1973 | matchLen = -1;
|
---|
1974 | oneTestMatchedLen = -1;
|
---|
1975 | curStack[0] = QRegExpEngine::InitialState;
|
---|
1976 |
|
---|
1977 | int ncap = eng->ncap;
|
---|
1978 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
1979 | if (ncap > 0) {
|
---|
1980 | for (j = 0; j < ncap; j++) {
|
---|
1981 | curCapBegin[j] = EmptyCapture;
|
---|
1982 | curCapEnd[j] = EmptyCapture;
|
---|
1983 | }
|
---|
1984 | }
|
---|
1985 | #endif
|
---|
1986 |
|
---|
1987 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
1988 | while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop)
|
---|
1989 | #else
|
---|
1990 | while (ncur > 0 && i <= len - pos && !stop)
|
---|
1991 | #endif
|
---|
1992 | {
|
---|
1993 | int ch = (i < len - pos) ? in[pos + i].unicode() : 0;
|
---|
1994 | for (j = 0; j < ncur; j++) {
|
---|
1995 | int cur = curStack[j];
|
---|
1996 | const QRegExpAutomatonState &scur = eng->s.at(cur);
|
---|
1997 | const QVector<int> &outs = scur.outs;
|
---|
1998 | for (k = 0; k < outs.size(); k++) {
|
---|
1999 | int next = outs.at(k);
|
---|
2000 | const QRegExpAutomatonState &snext = eng->s.at(next);
|
---|
2001 | bool inside = true;
|
---|
2002 | #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
|
---|
2003 | int needSomeSleep = 0;
|
---|
2004 | #endif
|
---|
2005 |
|
---|
2006 | /*
|
---|
2007 | First, check if the anchors are anchored properly.
|
---|
2008 | */
|
---|
2009 | int a = scur.anchors.value(next);
|
---|
2010 | if (a != 0 && !testAnchor(i, a, curCapBegin + j * ncap))
|
---|
2011 | inside = false;
|
---|
2012 |
|
---|
2013 | /*
|
---|
2014 | If indeed they are, check if the input character is
|
---|
2015 | correct for this transition.
|
---|
2016 | */
|
---|
2017 | if (inside) {
|
---|
2018 | m = snext.match;
|
---|
2019 | if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) {
|
---|
2020 | if (eng->cs)
|
---|
2021 | inside = (m == ch);
|
---|
2022 | else
|
---|
2023 | inside = (QChar(m).toLower() == QChar(ch).toLower());
|
---|
2024 | } else if (next == QRegExpEngine::FinalState) {
|
---|
2025 | matchLen = i;
|
---|
2026 | stop = minimal;
|
---|
2027 | inside = true;
|
---|
2028 | } else if ((m & QRegExpEngine::CharClassBit) != 0) {
|
---|
2029 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
2030 | const QRegExpCharClass &cc = eng->cl.at(m ^ QRegExpEngine::CharClassBit);
|
---|
2031 | if (eng->cs)
|
---|
2032 | inside = cc.in(ch);
|
---|
2033 | else if (cc.negative())
|
---|
2034 | inside = cc.in(QChar(ch).toLower()) &&
|
---|
2035 | cc.in(QChar(ch).toUpper());
|
---|
2036 | else
|
---|
2037 | inside = cc.in(QChar(ch).toLower()) ||
|
---|
2038 | cc.in(QChar(ch).toUpper());
|
---|
2039 | #endif
|
---|
2040 | #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
|
---|
2041 | } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */
|
---|
2042 | int bref = m ^ QRegExpEngine::BackRefBit;
|
---|
2043 | int ell = j * ncap + eng->captureForOfficialCapture.at(bref - 1);
|
---|
2044 |
|
---|
2045 | inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
|
---|
2046 | if (inside) {
|
---|
2047 | if (eng->cs)
|
---|
2048 | inside = (in[pos + curCapBegin[ell]] == QChar(ch));
|
---|
2049 | else
|
---|
2050 | inside = (in[pos + curCapBegin[ell]].toLower()
|
---|
2051 | == QChar(ch).toLower());
|
---|
2052 | }
|
---|
2053 |
|
---|
2054 | if (inside) {
|
---|
2055 | int delta;
|
---|
2056 | if (curCapEnd[ell] == EmptyCapture)
|
---|
2057 | delta = i - curCapBegin[ell];
|
---|
2058 | else
|
---|
2059 | delta = curCapEnd[ell] - curCapBegin[ell];
|
---|
2060 |
|
---|
2061 | inside = (delta <= len - (pos + i));
|
---|
2062 | if (inside && delta > 1) {
|
---|
2063 | int n = 1;
|
---|
2064 | if (eng->cs) {
|
---|
2065 | while (n < delta) {
|
---|
2066 | if (in[pos + curCapBegin[ell] + n]
|
---|
2067 | != in[pos + i + n])
|
---|
2068 | break;
|
---|
2069 | ++n;
|
---|
2070 | }
|
---|
2071 | } else {
|
---|
2072 | while (n < delta) {
|
---|
2073 | QChar a = in[pos + curCapBegin[ell] + n];
|
---|
2074 | QChar b = in[pos + i + n];
|
---|
2075 | if (a.toLower() != b.toLower())
|
---|
2076 | break;
|
---|
2077 | ++n;
|
---|
2078 | }
|
---|
2079 | }
|
---|
2080 | inside = (n == delta);
|
---|
2081 | if (inside)
|
---|
2082 | needSomeSleep = delta - 1;
|
---|
2083 | }
|
---|
2084 | }
|
---|
2085 | #endif
|
---|
2086 | }
|
---|
2087 | }
|
---|
2088 |
|
---|
2089 | /*
|
---|
2090 | We must now update our data structures.
|
---|
2091 | */
|
---|
2092 | if (inside) {
|
---|
2093 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
2094 | int *capBegin, *capEnd;
|
---|
2095 | #endif
|
---|
2096 | /*
|
---|
2097 | If the next state was not encountered yet, all
|
---|
2098 | is fine.
|
---|
2099 | */
|
---|
2100 | if ((m = inNextStack[next]) == -1) {
|
---|
2101 | m = nnext++;
|
---|
2102 | nextStack[m] = next;
|
---|
2103 | inNextStack[next] = m;
|
---|
2104 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
2105 | capBegin = nextCapBegin + m * ncap;
|
---|
2106 | capEnd = nextCapEnd + m * ncap;
|
---|
2107 |
|
---|
2108 | /*
|
---|
2109 | Otherwise, we'll first maintain captures in
|
---|
2110 | temporary arrays, and decide at the end whether
|
---|
2111 | it's best to keep the previous capture zones or
|
---|
2112 | the new ones.
|
---|
2113 | */
|
---|
2114 | } else {
|
---|
2115 | capBegin = tempCapBegin;
|
---|
2116 | capEnd = tempCapEnd;
|
---|
2117 | #endif
|
---|
2118 | }
|
---|
2119 |
|
---|
2120 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
2121 | /*
|
---|
2122 | Updating the capture zones is much of a task.
|
---|
2123 | */
|
---|
2124 | if (ncap > 0) {
|
---|
2125 | memcpy(capBegin, curCapBegin + j * ncap, ncap * sizeof(int));
|
---|
2126 | memcpy(capEnd, curCapEnd + j * ncap, ncap * sizeof(int));
|
---|
2127 | int c = scur.atom, n = snext.atom;
|
---|
2128 | int p = -1, q = -1;
|
---|
2129 | int cap;
|
---|
2130 |
|
---|
2131 | /*
|
---|
2132 | Lemma 1. For any x in the range [0..nf), we
|
---|
2133 | have f[x].parent < x.
|
---|
2134 |
|
---|
2135 | Proof. By looking at startAtom(), it is
|
---|
2136 | clear that cf < nf holds all the time, and
|
---|
2137 | thus that f[nf].parent < nf.
|
---|
2138 | */
|
---|
2139 |
|
---|
2140 | /*
|
---|
2141 | If we are reentering an atom, we empty all
|
---|
2142 | capture zones inside it.
|
---|
2143 | */
|
---|
2144 | if ((q = scur.reenter.value(next)) != 0) {
|
---|
2145 | QBitArray b(eng->nf, false);
|
---|
2146 | b.setBit(q, true);
|
---|
2147 | for (int ell = q + 1; ell < eng->nf; ell++) {
|
---|
2148 | if (b.testBit(eng->f.at(ell).parent)) {
|
---|
2149 | b.setBit(ell, true);
|
---|
2150 | cap = eng->f.at(ell).capture;
|
---|
2151 | if (cap >= 0) {
|
---|
2152 | capBegin[cap] = EmptyCapture;
|
---|
2153 | capEnd[cap] = EmptyCapture;
|
---|
2154 | }
|
---|
2155 | }
|
---|
2156 | }
|
---|
2157 | p = eng->f.at(q).parent;
|
---|
2158 |
|
---|
2159 | /*
|
---|
2160 | Otherwise, close the capture zones we are
|
---|
2161 | leaving. We are leaving f[c].capture,
|
---|
2162 | f[f[c].parent].capture,
|
---|
2163 | f[f[f[c].parent].parent].capture, ...,
|
---|
2164 | until f[x].capture, with x such that
|
---|
2165 | f[x].parent is the youngest common ancestor
|
---|
2166 | for c and n.
|
---|
2167 |
|
---|
2168 | We go up along c's and n's ancestry until
|
---|
2169 | we find x.
|
---|
2170 | */
|
---|
2171 | } else {
|
---|
2172 | p = c;
|
---|
2173 | q = n;
|
---|
2174 | while (p != q) {
|
---|
2175 | if (p > q) {
|
---|
2176 | cap = eng->f.at(p).capture;
|
---|
2177 | if (cap >= 0) {
|
---|
2178 | if (capBegin[cap] == i) {
|
---|
2179 | capBegin[cap] = EmptyCapture;
|
---|
2180 | capEnd[cap] = EmptyCapture;
|
---|
2181 | } else {
|
---|
2182 | capEnd[cap] = i;
|
---|
2183 | }
|
---|
2184 | }
|
---|
2185 | p = eng->f.at(p).parent;
|
---|
2186 | } else {
|
---|
2187 | q = eng->f.at(q).parent;
|
---|
2188 | }
|
---|
2189 | }
|
---|
2190 | }
|
---|
2191 |
|
---|
2192 | /*
|
---|
2193 | In any case, we now open the capture zones
|
---|
2194 | we are entering. We work upwards from n
|
---|
2195 | until we reach p (the parent of the atom we
|
---|
2196 | reenter or the youngest common ancestor).
|
---|
2197 | */
|
---|
2198 | while (n > p) {
|
---|
2199 | cap = eng->f.at(n).capture;
|
---|
2200 | if (cap >= 0) {
|
---|
2201 | capBegin[cap] = i;
|
---|
2202 | capEnd[cap] = EmptyCapture;
|
---|
2203 | }
|
---|
2204 | n = eng->f.at(n).parent;
|
---|
2205 | }
|
---|
2206 | /*
|
---|
2207 | If the next state was already in
|
---|
2208 | nextStack, we must choose carefully which
|
---|
2209 | capture zones we want to keep.
|
---|
2210 | */
|
---|
2211 | if (capBegin == tempCapBegin &&
|
---|
2212 | isBetterCapture(ncap, capBegin, capEnd, nextCapBegin + m * ncap,
|
---|
2213 | nextCapEnd + m * ncap)) {
|
---|
2214 | memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
|
---|
2215 | memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
|
---|
2216 | }
|
---|
2217 | }
|
---|
2218 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
2219 | /*
|
---|
2220 | We are done with updating the capture zones.
|
---|
2221 | It's now time to put the next state to sleep,
|
---|
2222 | if it needs to, and to remove it from
|
---|
2223 | nextStack.
|
---|
2224 | */
|
---|
2225 | if (needSomeSleep > 0) {
|
---|
2226 | QVector<int> zzZ(2 + 2 * ncap);
|
---|
2227 | zzZ[0] = i + needSomeSleep;
|
---|
2228 | zzZ[1] = next;
|
---|
2229 | if (ncap > 0) {
|
---|
2230 | memcpy(zzZ.data() + 2, capBegin, ncap * sizeof(int));
|
---|
2231 | memcpy(zzZ.data() + 2 + ncap, capEnd, ncap * sizeof(int));
|
---|
2232 | }
|
---|
2233 | inNextStack[nextStack[--nnext]] = -1;
|
---|
2234 | sleeping.append(zzZ);
|
---|
2235 | }
|
---|
2236 | #endif
|
---|
2237 | #endif
|
---|
2238 | }
|
---|
2239 | }
|
---|
2240 | }
|
---|
2241 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
2242 | /*
|
---|
2243 | If we reached the final state, hurray! Copy the captured
|
---|
2244 | zone.
|
---|
2245 | */
|
---|
2246 | if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {
|
---|
2247 | memcpy(capBegin, nextCapBegin + m * ncap, ncap * sizeof(int));
|
---|
2248 | memcpy(capEnd, nextCapEnd + m * ncap, ncap * sizeof(int));
|
---|
2249 | }
|
---|
2250 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
2251 | /*
|
---|
2252 | It's time to wake up the sleepers.
|
---|
2253 | */
|
---|
2254 | j = 0;
|
---|
2255 | while (j < sleeping.count()) {
|
---|
2256 | if (sleeping.at(j)[0] == i) {
|
---|
2257 | const QVector<int> &zzZ = sleeping.at(j);
|
---|
2258 | int next = zzZ[1];
|
---|
2259 | const int *capBegin = zzZ.data() + 2;
|
---|
2260 | const int *capEnd = zzZ.data() + 2 + ncap;
|
---|
2261 | bool copyOver = true;
|
---|
2262 |
|
---|
2263 | if ((m = inNextStack[next]) == -1) {
|
---|
2264 | m = nnext++;
|
---|
2265 | nextStack[m] = next;
|
---|
2266 | inNextStack[next] = m;
|
---|
2267 | } else {
|
---|
2268 | copyOver = isBetterCapture(ncap, nextCapBegin + m * ncap, nextCapEnd + m * ncap,
|
---|
2269 | capBegin, capEnd);
|
---|
2270 | }
|
---|
2271 | if (copyOver) {
|
---|
2272 | memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
|
---|
2273 | memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
|
---|
2274 | }
|
---|
2275 |
|
---|
2276 | sleeping.removeAt(j);
|
---|
2277 | } else {
|
---|
2278 | ++j;
|
---|
2279 | }
|
---|
2280 | }
|
---|
2281 | #endif
|
---|
2282 | #endif
|
---|
2283 | for (j = 0; j < nnext; j++)
|
---|
2284 | inNextStack[nextStack[j]] = -1;
|
---|
2285 |
|
---|
2286 | // avoid needless iteration that confuses oneTestMatchedLen
|
---|
2287 | if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState
|
---|
2288 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
2289 | && sleeping.isEmpty()
|
---|
2290 | #endif
|
---|
2291 | )
|
---|
2292 | stop = true;
|
---|
2293 |
|
---|
2294 | qSwap(curStack, nextStack);
|
---|
2295 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
2296 | qSwap(curCapBegin, nextCapBegin);
|
---|
2297 | qSwap(curCapEnd, nextCapEnd);
|
---|
2298 | #endif
|
---|
2299 | ncur = nnext;
|
---|
2300 | nnext = 0;
|
---|
2301 | ++i;
|
---|
2302 | }
|
---|
2303 |
|
---|
2304 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
2305 | /*
|
---|
2306 | If minimal matching is enabled, we might have some sleepers
|
---|
2307 | left.
|
---|
2308 | */
|
---|
2309 | if (!sleeping.isEmpty())
|
---|
2310 | sleeping.clear();
|
---|
2311 | #endif
|
---|
2312 |
|
---|
2313 | oneTestMatchedLen = i - 1;
|
---|
2314 | return (matchLen >= 0);
|
---|
2315 | }
|
---|
2316 |
|
---|
2317 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
2318 |
|
---|
2319 | QRegExpCharClass::QRegExpCharClass()
|
---|
2320 | : c(0), n(false)
|
---|
2321 | {
|
---|
2322 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2323 | occ1.fill(NoOccurrence, NumBadChars);
|
---|
2324 | #endif
|
---|
2325 | }
|
---|
2326 |
|
---|
2327 | QRegExpCharClass &QRegExpCharClass::operator=(const QRegExpCharClass &cc)
|
---|
2328 | {
|
---|
2329 | c = cc.c;
|
---|
2330 | r = cc.r;
|
---|
2331 | n = cc.n;
|
---|
2332 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2333 | occ1 = cc.occ1;
|
---|
2334 | #endif
|
---|
2335 | return *this;
|
---|
2336 | }
|
---|
2337 |
|
---|
2338 | void QRegExpCharClass::clear()
|
---|
2339 | {
|
---|
2340 | c = 0;
|
---|
2341 | r.resize(0);
|
---|
2342 | n = false;
|
---|
2343 | }
|
---|
2344 |
|
---|
2345 | void QRegExpCharClass::setNegative(bool negative)
|
---|
2346 | {
|
---|
2347 | n = negative;
|
---|
2348 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2349 | occ1.fill(0, NumBadChars);
|
---|
2350 | #endif
|
---|
2351 | }
|
---|
2352 |
|
---|
2353 | void QRegExpCharClass::addCategories(int cats)
|
---|
2354 | {
|
---|
2355 | c |= cats;
|
---|
2356 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2357 | occ1.fill(0, NumBadChars);
|
---|
2358 | #endif
|
---|
2359 | }
|
---|
2360 |
|
---|
2361 | void QRegExpCharClass::addRange(ushort from, ushort to)
|
---|
2362 | {
|
---|
2363 | if (from > to)
|
---|
2364 | qSwap(from, to);
|
---|
2365 | int m = r.size();
|
---|
2366 | r.resize(m + 1);
|
---|
2367 | r[m].from = from;
|
---|
2368 | r[m].len = to - from + 1;
|
---|
2369 |
|
---|
2370 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2371 | int i;
|
---|
2372 |
|
---|
2373 | if (to - from < NumBadChars) {
|
---|
2374 | if (from % NumBadChars <= to % NumBadChars) {
|
---|
2375 | for (i = from % NumBadChars; i <= to % NumBadChars; i++)
|
---|
2376 | occ1[i] = 0;
|
---|
2377 | } else {
|
---|
2378 | for (i = 0; i <= to % NumBadChars; i++)
|
---|
2379 | occ1[i] = 0;
|
---|
2380 | for (i = from % NumBadChars; i < NumBadChars; i++)
|
---|
2381 | occ1[i] = 0;
|
---|
2382 | }
|
---|
2383 | } else {
|
---|
2384 | occ1.fill(0, NumBadChars);
|
---|
2385 | }
|
---|
2386 | #endif
|
---|
2387 | }
|
---|
2388 |
|
---|
2389 | bool QRegExpCharClass::in(QChar ch) const
|
---|
2390 | {
|
---|
2391 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2392 | if (occ1.at(BadChar(ch)) == NoOccurrence)
|
---|
2393 | return n;
|
---|
2394 | #endif
|
---|
2395 |
|
---|
2396 | if (c != 0 && (c & (1 << (int)ch.category())) != 0)
|
---|
2397 | return !n;
|
---|
2398 |
|
---|
2399 | const int uc = ch.unicode();
|
---|
2400 | int size = r.size();
|
---|
2401 |
|
---|
2402 | for (int i = 0; i < size; ++i) {
|
---|
2403 | const QRegExpCharClassRange &range = r.at(i);
|
---|
2404 | if (uint(uc - range.from) < uint(r.at(i).len))
|
---|
2405 | return !n;
|
---|
2406 | }
|
---|
2407 | return n;
|
---|
2408 | }
|
---|
2409 |
|
---|
2410 | #if defined(QT_DEBUG)
|
---|
2411 | void QRegExpCharClass::dump() const
|
---|
2412 | {
|
---|
2413 | int i;
|
---|
2414 | qDebug(" %stive character class", n ? "nega" : "posi");
|
---|
2415 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
2416 | if (c != 0)
|
---|
2417 | qDebug(" categories 0x%.8x", c);
|
---|
2418 | #endif
|
---|
2419 | for (i = 0; i < r.size(); i++)
|
---|
2420 | qDebug(" 0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);
|
---|
2421 | }
|
---|
2422 | #endif
|
---|
2423 | #endif
|
---|
2424 |
|
---|
2425 | QRegExpEngine::Box::Box(QRegExpEngine *engine)
|
---|
2426 | : eng(engine), skipanchors(0)
|
---|
2427 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2428 | , earlyStart(0), lateStart(0), maxl(0)
|
---|
2429 | #endif
|
---|
2430 | {
|
---|
2431 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2432 | occ1.fill(NoOccurrence, NumBadChars);
|
---|
2433 | #endif
|
---|
2434 | minl = 0;
|
---|
2435 | }
|
---|
2436 |
|
---|
2437 | QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
|
---|
2438 | {
|
---|
2439 | eng = b.eng;
|
---|
2440 | ls = b.ls;
|
---|
2441 | rs = b.rs;
|
---|
2442 | lanchors = b.lanchors;
|
---|
2443 | ranchors = b.ranchors;
|
---|
2444 | skipanchors = b.skipanchors;
|
---|
2445 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2446 | earlyStart = b.earlyStart;
|
---|
2447 | lateStart = b.lateStart;
|
---|
2448 | str = b.str;
|
---|
2449 | leftStr = b.leftStr;
|
---|
2450 | rightStr = b.rightStr;
|
---|
2451 | maxl = b.maxl;
|
---|
2452 | occ1 = b.occ1;
|
---|
2453 | #endif
|
---|
2454 | minl = b.minl;
|
---|
2455 | return *this;
|
---|
2456 | }
|
---|
2457 |
|
---|
2458 | void QRegExpEngine::Box::set(QChar ch)
|
---|
2459 | {
|
---|
2460 | ls.resize(1);
|
---|
2461 | ls[0] = eng->createState(ch);
|
---|
2462 | rs = ls;
|
---|
2463 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2464 | str = ch;
|
---|
2465 | leftStr = ch;
|
---|
2466 | rightStr = ch;
|
---|
2467 | maxl = 1;
|
---|
2468 | occ1[BadChar(ch)] = 0;
|
---|
2469 | #endif
|
---|
2470 | minl = 1;
|
---|
2471 | }
|
---|
2472 |
|
---|
2473 | void QRegExpEngine::Box::set(const QRegExpCharClass &cc)
|
---|
2474 | {
|
---|
2475 | ls.resize(1);
|
---|
2476 | ls[0] = eng->createState(cc);
|
---|
2477 | rs = ls;
|
---|
2478 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2479 | maxl = 1;
|
---|
2480 | occ1 = cc.firstOccurrence();
|
---|
2481 | #endif
|
---|
2482 | minl = 1;
|
---|
2483 | }
|
---|
2484 |
|
---|
2485 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
2486 | void QRegExpEngine::Box::set(int bref)
|
---|
2487 | {
|
---|
2488 | ls.resize(1);
|
---|
2489 | ls[0] = eng->createState(bref);
|
---|
2490 | rs = ls;
|
---|
2491 | if (bref >= 1 && bref <= MaxBackRefs)
|
---|
2492 | skipanchors = Anchor_BackRef0Empty << bref;
|
---|
2493 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2494 | maxl = InftyLen;
|
---|
2495 | #endif
|
---|
2496 | minl = 0;
|
---|
2497 | }
|
---|
2498 | #endif
|
---|
2499 |
|
---|
2500 | void QRegExpEngine::Box::cat(const Box &b)
|
---|
2501 | {
|
---|
2502 | eng->addCatTransitions(rs, b.ls);
|
---|
2503 | addAnchorsToEngine(b);
|
---|
2504 | if (minl == 0) {
|
---|
2505 | lanchors.unite(b.lanchors);
|
---|
2506 | if (skipanchors != 0) {
|
---|
2507 | for (int i = 0; i < b.ls.size(); i++) {
|
---|
2508 | int a = eng->anchorConcatenation(lanchors.value(b.ls.at(i), 0), skipanchors);
|
---|
2509 | lanchors.insert(b.ls.at(i), a);
|
---|
2510 | }
|
---|
2511 | }
|
---|
2512 | mergeInto(&ls, b.ls);
|
---|
2513 | }
|
---|
2514 | if (b.minl == 0) {
|
---|
2515 | ranchors.unite(b.ranchors);
|
---|
2516 | if (b.skipanchors != 0) {
|
---|
2517 | for (int i = 0; i < rs.size(); i++) {
|
---|
2518 | int a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), b.skipanchors);
|
---|
2519 | ranchors.insert(rs.at(i), a);
|
---|
2520 | }
|
---|
2521 | }
|
---|
2522 | mergeInto(&rs, b.rs);
|
---|
2523 | } else {
|
---|
2524 | ranchors = b.ranchors;
|
---|
2525 | rs = b.rs;
|
---|
2526 | }
|
---|
2527 |
|
---|
2528 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2529 | if (maxl != InftyLen) {
|
---|
2530 | if (rightStr.length() + b.leftStr.length() >
|
---|
2531 | qMax(str.length(), b.str.length())) {
|
---|
2532 | earlyStart = minl - rightStr.length();
|
---|
2533 | lateStart = maxl - rightStr.length();
|
---|
2534 | str = rightStr + b.leftStr;
|
---|
2535 | } else if (b.str.length() > str.length()) {
|
---|
2536 | earlyStart = minl + b.earlyStart;
|
---|
2537 | lateStart = maxl + b.lateStart;
|
---|
2538 | str = b.str;
|
---|
2539 | }
|
---|
2540 | }
|
---|
2541 |
|
---|
2542 | if (leftStr.length() == maxl)
|
---|
2543 | leftStr += b.leftStr;
|
---|
2544 |
|
---|
2545 | if (b.rightStr.length() == b.maxl) {
|
---|
2546 | rightStr += b.rightStr;
|
---|
2547 | } else {
|
---|
2548 | rightStr = b.rightStr;
|
---|
2549 | }
|
---|
2550 |
|
---|
2551 | if (maxl == InftyLen || b.maxl == InftyLen) {
|
---|
2552 | maxl = InftyLen;
|
---|
2553 | } else {
|
---|
2554 | maxl += b.maxl;
|
---|
2555 | }
|
---|
2556 |
|
---|
2557 | for (int i = 0; i < NumBadChars; i++) {
|
---|
2558 | if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))
|
---|
2559 | occ1[i] = minl + b.occ1.at(i);
|
---|
2560 | }
|
---|
2561 | #endif
|
---|
2562 |
|
---|
2563 | minl += b.minl;
|
---|
2564 | if (minl == 0)
|
---|
2565 | skipanchors = eng->anchorConcatenation(skipanchors, b.skipanchors);
|
---|
2566 | else
|
---|
2567 | skipanchors = 0;
|
---|
2568 | }
|
---|
2569 |
|
---|
2570 | void QRegExpEngine::Box::orx(const Box &b)
|
---|
2571 | {
|
---|
2572 | mergeInto(&ls, b.ls);
|
---|
2573 | lanchors.unite(b.lanchors);
|
---|
2574 | mergeInto(&rs, b.rs);
|
---|
2575 | ranchors.unite(b.ranchors);
|
---|
2576 |
|
---|
2577 | if (b.minl == 0) {
|
---|
2578 | if (minl == 0)
|
---|
2579 | skipanchors = eng->anchorAlternation(skipanchors, b.skipanchors);
|
---|
2580 | else
|
---|
2581 | skipanchors = b.skipanchors;
|
---|
2582 | }
|
---|
2583 |
|
---|
2584 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2585 | for (int i = 0; i < NumBadChars; i++) {
|
---|
2586 | if (occ1.at(i) > b.occ1.at(i))
|
---|
2587 | occ1[i] = b.occ1.at(i);
|
---|
2588 | }
|
---|
2589 | earlyStart = 0;
|
---|
2590 | lateStart = 0;
|
---|
2591 | str = QString();
|
---|
2592 | leftStr = QString();
|
---|
2593 | rightStr = QString();
|
---|
2594 | if (b.maxl > maxl)
|
---|
2595 | maxl = b.maxl;
|
---|
2596 | #endif
|
---|
2597 | if (b.minl < minl)
|
---|
2598 | minl = b.minl;
|
---|
2599 | }
|
---|
2600 |
|
---|
2601 | void QRegExpEngine::Box::plus(int atom)
|
---|
2602 | {
|
---|
2603 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
2604 | eng->addPlusTransitions(rs, ls, atom);
|
---|
2605 | #else
|
---|
2606 | Q_UNUSED(atom);
|
---|
2607 | eng->addCatTransitions(rs, ls);
|
---|
2608 | #endif
|
---|
2609 | addAnchorsToEngine(*this);
|
---|
2610 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2611 | maxl = InftyLen;
|
---|
2612 | #endif
|
---|
2613 | }
|
---|
2614 |
|
---|
2615 | void QRegExpEngine::Box::opt()
|
---|
2616 | {
|
---|
2617 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2618 | earlyStart = 0;
|
---|
2619 | lateStart = 0;
|
---|
2620 | str = QString();
|
---|
2621 | leftStr = QString();
|
---|
2622 | rightStr = QString();
|
---|
2623 | #endif
|
---|
2624 | skipanchors = 0;
|
---|
2625 | minl = 0;
|
---|
2626 | }
|
---|
2627 |
|
---|
2628 | void QRegExpEngine::Box::catAnchor(int a)
|
---|
2629 | {
|
---|
2630 | if (a != 0) {
|
---|
2631 | for (int i = 0; i < rs.size(); i++) {
|
---|
2632 | a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), a);
|
---|
2633 | ranchors.insert(rs.at(i), a);
|
---|
2634 | }
|
---|
2635 | if (minl == 0)
|
---|
2636 | skipanchors = eng->anchorConcatenation(skipanchors, a);
|
---|
2637 | }
|
---|
2638 | }
|
---|
2639 |
|
---|
2640 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
2641 | void QRegExpEngine::Box::setupHeuristics()
|
---|
2642 | {
|
---|
2643 | eng->goodEarlyStart = earlyStart;
|
---|
2644 | eng->goodLateStart = lateStart;
|
---|
2645 | eng->goodStr = eng->cs ? str : str.toLower();
|
---|
2646 |
|
---|
2647 | eng->minl = minl;
|
---|
2648 | if (eng->cs) {
|
---|
2649 | /*
|
---|
2650 | A regular expression such as 112|1 has occ1['2'] = 2 and minl =
|
---|
2651 | 1 at this point. An entry of occ1 has to be at most minl or
|
---|
2652 | infinity for the rest of the algorithm to go well.
|
---|
2653 |
|
---|
2654 | We waited until here before normalizing these cases (instead of
|
---|
2655 | doing it in Box::orx()) because sometimes things improve by
|
---|
2656 | themselves. Consider for example (112|1)34.
|
---|
2657 | */
|
---|
2658 | for (int i = 0; i < NumBadChars; i++) {
|
---|
2659 | if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)
|
---|
2660 | occ1[i] = minl;
|
---|
2661 | }
|
---|
2662 | eng->occ1 = occ1;
|
---|
2663 | } else {
|
---|
2664 | eng->occ1.fill(0, NumBadChars);
|
---|
2665 | }
|
---|
2666 |
|
---|
2667 | eng->heuristicallyChooseHeuristic();
|
---|
2668 | }
|
---|
2669 | #endif
|
---|
2670 |
|
---|
2671 | #if defined(QT_DEBUG)
|
---|
2672 | void QRegExpEngine::Box::dump() const
|
---|
2673 | {
|
---|
2674 | int i;
|
---|
2675 | qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s");
|
---|
2676 | qDebug(" Left states:");
|
---|
2677 | for (i = 0; i < ls.size(); i++) {
|
---|
2678 | if (lanchors.value(ls[i], 0) == 0)
|
---|
2679 | qDebug(" %d", ls[i]);
|
---|
2680 | else
|
---|
2681 | qDebug(" %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);
|
---|
2682 | }
|
---|
2683 | qDebug(" Right states:");
|
---|
2684 | for (i = 0; i < rs.size(); i++) {
|
---|
2685 | if (ranchors.value(rs[i], 0) == 0)
|
---|
2686 | qDebug(" %d", rs[i]);
|
---|
2687 | else
|
---|
2688 | qDebug(" %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);
|
---|
2689 | }
|
---|
2690 | qDebug(" Skip anchors: 0x%.8x", skipanchors);
|
---|
2691 | }
|
---|
2692 | #endif
|
---|
2693 |
|
---|
2694 | void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const
|
---|
2695 | {
|
---|
2696 | for (int i = 0; i < to.ls.size(); i++) {
|
---|
2697 | for (int j = 0; j < rs.size(); j++) {
|
---|
2698 | int a = eng->anchorConcatenation(ranchors.value(rs.at(j), 0),
|
---|
2699 | to.lanchors.value(to.ls.at(i), 0));
|
---|
2700 | eng->addAnchors(rs[j], to.ls[i], a);
|
---|
2701 | }
|
---|
2702 | }
|
---|
2703 | }
|
---|
2704 |
|
---|
2705 | void QRegExpEngine::setupCategoriesRangeMap()
|
---|
2706 | {
|
---|
2707 | categoriesRangeMap.insert("IsBasicLatin", qMakePair(0x0000, 0x007F));
|
---|
2708 | categoriesRangeMap.insert("IsLatin-1Supplement", qMakePair(0x0080, 0x00FF));
|
---|
2709 | categoriesRangeMap.insert("IsLatinExtended-A", qMakePair(0x0100, 0x017F));
|
---|
2710 | categoriesRangeMap.insert("IsLatinExtended-B", qMakePair(0x0180, 0x024F));
|
---|
2711 | categoriesRangeMap.insert("IsIPAExtensions", qMakePair(0x0250, 0x02AF));
|
---|
2712 | categoriesRangeMap.insert("IsSpacingModifierLetters", qMakePair(0x02B0, 0x02FF));
|
---|
2713 | categoriesRangeMap.insert("IsCombiningDiacriticalMarks", qMakePair(0x0300, 0x036F));
|
---|
2714 | categoriesRangeMap.insert("IsGreek", qMakePair(0x0370, 0x03FF));
|
---|
2715 | categoriesRangeMap.insert("IsCyrillic", qMakePair(0x0400, 0x04FF));
|
---|
2716 | categoriesRangeMap.insert("IsCyrillicSupplement", qMakePair(0x0500, 0x052F));
|
---|
2717 | categoriesRangeMap.insert("IsArmenian", qMakePair(0x0530, 0x058F));
|
---|
2718 | categoriesRangeMap.insert("IsHebrew", qMakePair(0x0590, 0x05FF));
|
---|
2719 | categoriesRangeMap.insert("IsArabic", qMakePair(0x0600, 0x06FF));
|
---|
2720 | categoriesRangeMap.insert("IsSyriac", qMakePair(0x0700, 0x074F));
|
---|
2721 | categoriesRangeMap.insert("IsArabicSupplement", qMakePair(0x0750, 0x077F));
|
---|
2722 | categoriesRangeMap.insert("IsThaana", qMakePair(0x0780, 0x07BF));
|
---|
2723 | categoriesRangeMap.insert("IsDevanagari", qMakePair(0x0900, 0x097F));
|
---|
2724 | categoriesRangeMap.insert("IsBengali", qMakePair(0x0980, 0x09FF));
|
---|
2725 | categoriesRangeMap.insert("IsGurmukhi", qMakePair(0x0A00, 0x0A7F));
|
---|
2726 | categoriesRangeMap.insert("IsGujarati", qMakePair(0x0A80, 0x0AFF));
|
---|
2727 | categoriesRangeMap.insert("IsOriya", qMakePair(0x0B00, 0x0B7F));
|
---|
2728 | categoriesRangeMap.insert("IsTamil", qMakePair(0x0B80, 0x0BFF));
|
---|
2729 | categoriesRangeMap.insert("IsTelugu", qMakePair(0x0C00, 0x0C7F));
|
---|
2730 | categoriesRangeMap.insert("IsKannada", qMakePair(0x0C80, 0x0CFF));
|
---|
2731 | categoriesRangeMap.insert("IsMalayalam", qMakePair(0x0D00, 0x0D7F));
|
---|
2732 | categoriesRangeMap.insert("IsSinhala", qMakePair(0x0D80, 0x0DFF));
|
---|
2733 | categoriesRangeMap.insert("IsThai", qMakePair(0x0E00, 0x0E7F));
|
---|
2734 | categoriesRangeMap.insert("IsLao", qMakePair(0x0E80, 0x0EFF));
|
---|
2735 | categoriesRangeMap.insert("IsTibetan", qMakePair(0x0F00, 0x0FFF));
|
---|
2736 | categoriesRangeMap.insert("IsMyanmar", qMakePair(0x1000, 0x109F));
|
---|
2737 | categoriesRangeMap.insert("IsGeorgian", qMakePair(0x10A0, 0x10FF));
|
---|
2738 | categoriesRangeMap.insert("IsHangulJamo", qMakePair(0x1100, 0x11FF));
|
---|
2739 | categoriesRangeMap.insert("IsEthiopic", qMakePair(0x1200, 0x137F));
|
---|
2740 | categoriesRangeMap.insert("IsEthiopicSupplement", qMakePair(0x1380, 0x139F));
|
---|
2741 | categoriesRangeMap.insert("IsCherokee", qMakePair(0x13A0, 0x13FF));
|
---|
2742 | categoriesRangeMap.insert("IsUnifiedCanadianAboriginalSyllabics", qMakePair(0x1400, 0x167F));
|
---|
2743 | categoriesRangeMap.insert("IsOgham", qMakePair(0x1680, 0x169F));
|
---|
2744 | categoriesRangeMap.insert("IsRunic", qMakePair(0x16A0, 0x16FF));
|
---|
2745 | categoriesRangeMap.insert("IsTagalog", qMakePair(0x1700, 0x171F));
|
---|
2746 | categoriesRangeMap.insert("IsHanunoo", qMakePair(0x1720, 0x173F));
|
---|
2747 | categoriesRangeMap.insert("IsBuhid", qMakePair(0x1740, 0x175F));
|
---|
2748 | categoriesRangeMap.insert("IsTagbanwa", qMakePair(0x1760, 0x177F));
|
---|
2749 | categoriesRangeMap.insert("IsKhmer", qMakePair(0x1780, 0x17FF));
|
---|
2750 | categoriesRangeMap.insert("IsMongolian", qMakePair(0x1800, 0x18AF));
|
---|
2751 | categoriesRangeMap.insert("IsLimbu", qMakePair(0x1900, 0x194F));
|
---|
2752 | categoriesRangeMap.insert("IsTaiLe", qMakePair(0x1950, 0x197F));
|
---|
2753 | categoriesRangeMap.insert("IsNewTaiLue", qMakePair(0x1980, 0x19DF));
|
---|
2754 | categoriesRangeMap.insert("IsKhmerSymbols", qMakePair(0x19E0, 0x19FF));
|
---|
2755 | categoriesRangeMap.insert("IsBuginese", qMakePair(0x1A00, 0x1A1F));
|
---|
2756 | categoriesRangeMap.insert("IsPhoneticExtensions", qMakePair(0x1D00, 0x1D7F));
|
---|
2757 | categoriesRangeMap.insert("IsPhoneticExtensionsSupplement", qMakePair(0x1D80, 0x1DBF));
|
---|
2758 | categoriesRangeMap.insert("IsCombiningDiacriticalMarksSupplement", qMakePair(0x1DC0, 0x1DFF));
|
---|
2759 | categoriesRangeMap.insert("IsLatinExtendedAdditional", qMakePair(0x1E00, 0x1EFF));
|
---|
2760 | categoriesRangeMap.insert("IsGreekExtended", qMakePair(0x1F00, 0x1FFF));
|
---|
2761 | categoriesRangeMap.insert("IsGeneralPunctuation", qMakePair(0x2000, 0x206F));
|
---|
2762 | categoriesRangeMap.insert("IsSuperscriptsandSubscripts", qMakePair(0x2070, 0x209F));
|
---|
2763 | categoriesRangeMap.insert("IsCurrencySymbols", qMakePair(0x20A0, 0x20CF));
|
---|
2764 | categoriesRangeMap.insert("IsCombiningMarksforSymbols", qMakePair(0x20D0, 0x20FF));
|
---|
2765 | categoriesRangeMap.insert("IsLetterlikeSymbols", qMakePair(0x2100, 0x214F));
|
---|
2766 | categoriesRangeMap.insert("IsNumberForms", qMakePair(0x2150, 0x218F));
|
---|
2767 | categoriesRangeMap.insert("IsArrows", qMakePair(0x2190, 0x21FF));
|
---|
2768 | categoriesRangeMap.insert("IsMathematicalOperators", qMakePair(0x2200, 0x22FF));
|
---|
2769 | categoriesRangeMap.insert("IsMiscellaneousTechnical", qMakePair(0x2300, 0x23FF));
|
---|
2770 | categoriesRangeMap.insert("IsControlPictures", qMakePair(0x2400, 0x243F));
|
---|
2771 | categoriesRangeMap.insert("IsOpticalCharacterRecognition", qMakePair(0x2440, 0x245F));
|
---|
2772 | categoriesRangeMap.insert("IsEnclosedAlphanumerics", qMakePair(0x2460, 0x24FF));
|
---|
2773 | categoriesRangeMap.insert("IsBoxDrawing", qMakePair(0x2500, 0x257F));
|
---|
2774 | categoriesRangeMap.insert("IsBlockElements", qMakePair(0x2580, 0x259F));
|
---|
2775 | categoriesRangeMap.insert("IsGeometricShapes", qMakePair(0x25A0, 0x25FF));
|
---|
2776 | categoriesRangeMap.insert("IsMiscellaneousSymbols", qMakePair(0x2600, 0x26FF));
|
---|
2777 | categoriesRangeMap.insert("IsDingbats", qMakePair(0x2700, 0x27BF));
|
---|
2778 | categoriesRangeMap.insert("IsMiscellaneousMathematicalSymbols-A", qMakePair(0x27C0, 0x27EF));
|
---|
2779 | categoriesRangeMap.insert("IsSupplementalArrows-A", qMakePair(0x27F0, 0x27FF));
|
---|
2780 | categoriesRangeMap.insert("IsBraillePatterns", qMakePair(0x2800, 0x28FF));
|
---|
2781 | categoriesRangeMap.insert("IsSupplementalArrows-B", qMakePair(0x2900, 0x297F));
|
---|
2782 | categoriesRangeMap.insert("IsMiscellaneousMathematicalSymbols-B", qMakePair(0x2980, 0x29FF));
|
---|
2783 | categoriesRangeMap.insert("IsSupplementalMathematicalOperators", qMakePair(0x2A00, 0x2AFF));
|
---|
2784 | categoriesRangeMap.insert("IsMiscellaneousSymbolsandArrows", qMakePair(0x2B00, 0x2BFF));
|
---|
2785 | categoriesRangeMap.insert("IsGlagolitic", qMakePair(0x2C00, 0x2C5F));
|
---|
2786 | categoriesRangeMap.insert("IsCoptic", qMakePair(0x2C80, 0x2CFF));
|
---|
2787 | categoriesRangeMap.insert("IsGeorgianSupplement", qMakePair(0x2D00, 0x2D2F));
|
---|
2788 | categoriesRangeMap.insert("IsTifinagh", qMakePair(0x2D30, 0x2D7F));
|
---|
2789 | categoriesRangeMap.insert("IsEthiopicExtended", qMakePair(0x2D80, 0x2DDF));
|
---|
2790 | categoriesRangeMap.insert("IsSupplementalPunctuation", qMakePair(0x2E00, 0x2E7F));
|
---|
2791 | categoriesRangeMap.insert("IsCJKRadicalsSupplement", qMakePair(0x2E80, 0x2EFF));
|
---|
2792 | categoriesRangeMap.insert("IsKangxiRadicals", qMakePair(0x2F00, 0x2FDF));
|
---|
2793 | categoriesRangeMap.insert("IsIdeographicDescriptionCharacters", qMakePair(0x2FF0, 0x2FFF));
|
---|
2794 | categoriesRangeMap.insert("IsCJKSymbolsandPunctuation", qMakePair(0x3000, 0x303F));
|
---|
2795 | categoriesRangeMap.insert("IsHiragana", qMakePair(0x3040, 0x309F));
|
---|
2796 | categoriesRangeMap.insert("IsKatakana", qMakePair(0x30A0, 0x30FF));
|
---|
2797 | categoriesRangeMap.insert("IsBopomofo", qMakePair(0x3100, 0x312F));
|
---|
2798 | categoriesRangeMap.insert("IsHangulCompatibilityJamo", qMakePair(0x3130, 0x318F));
|
---|
2799 | categoriesRangeMap.insert("IsKanbun", qMakePair(0x3190, 0x319F));
|
---|
2800 | categoriesRangeMap.insert("IsBopomofoExtended", qMakePair(0x31A0, 0x31BF));
|
---|
2801 | categoriesRangeMap.insert("IsCJKStrokes", qMakePair(0x31C0, 0x31EF));
|
---|
2802 | categoriesRangeMap.insert("IsKatakanaPhoneticExtensions", qMakePair(0x31F0, 0x31FF));
|
---|
2803 | categoriesRangeMap.insert("IsEnclosedCJKLettersandMonths", qMakePair(0x3200, 0x32FF));
|
---|
2804 | categoriesRangeMap.insert("IsCJKCompatibility", qMakePair(0x3300, 0x33FF));
|
---|
2805 | categoriesRangeMap.insert("IsCJKUnifiedIdeographsExtensionA", qMakePair(0x3400, 0x4DB5));
|
---|
2806 | categoriesRangeMap.insert("IsYijingHexagramSymbols", qMakePair(0x4DC0, 0x4DFF));
|
---|
2807 | categoriesRangeMap.insert("IsCJKUnifiedIdeographs", qMakePair(0x4E00, 0x9FFF));
|
---|
2808 | categoriesRangeMap.insert("IsYiSyllables", qMakePair(0xA000, 0xA48F));
|
---|
2809 | categoriesRangeMap.insert("IsYiRadicals", qMakePair(0xA490, 0xA4CF));
|
---|
2810 | categoriesRangeMap.insert("IsModifierToneLetters", qMakePair(0xA700, 0xA71F));
|
---|
2811 | categoriesRangeMap.insert("IsSylotiNagri", qMakePair(0xA800, 0xA82F));
|
---|
2812 | categoriesRangeMap.insert("IsHangulSyllables", qMakePair(0xAC00, 0xD7A3));
|
---|
2813 | categoriesRangeMap.insert("IsPrivateUse", qMakePair(0xE000, 0xF8FF));
|
---|
2814 | categoriesRangeMap.insert("IsCJKCompatibilityIdeographs", qMakePair(0xF900, 0xFAFF));
|
---|
2815 | categoriesRangeMap.insert("IsAlphabeticPresentationForms", qMakePair(0xFB00, 0xFB4F));
|
---|
2816 | categoriesRangeMap.insert("IsArabicPresentationForms-A", qMakePair(0xFB50, 0xFDFF));
|
---|
2817 | categoriesRangeMap.insert("IsVariationSelectors", qMakePair(0xFE00, 0xFE0F));
|
---|
2818 | categoriesRangeMap.insert("IsVerticalForms", qMakePair(0xFE10, 0xFE1F));
|
---|
2819 | categoriesRangeMap.insert("IsCombiningHalfMarks", qMakePair(0xFE20, 0xFE2F));
|
---|
2820 | categoriesRangeMap.insert("IsCJKCompatibilityForms", qMakePair(0xFE30, 0xFE4F));
|
---|
2821 | categoriesRangeMap.insert("IsSmallFormVariants", qMakePair(0xFE50, 0xFE6F));
|
---|
2822 | categoriesRangeMap.insert("IsArabicPresentationForms-B", qMakePair(0xFE70, 0xFEFF));
|
---|
2823 | categoriesRangeMap.insert("IsHalfwidthandFullwidthForms", qMakePair(0xFF00, 0xFFEF));
|
---|
2824 | categoriesRangeMap.insert("IsSpecials", qMakePair(0xFFF0, 0xFFFF));
|
---|
2825 | categoriesRangeMap.insert("IsLinearBSyllabary", qMakePair(0x10000, 0x1007F));
|
---|
2826 | categoriesRangeMap.insert("IsLinearBIdeograms", qMakePair(0x10080, 0x100FF));
|
---|
2827 | categoriesRangeMap.insert("IsAegeanNumbers", qMakePair(0x10100, 0x1013F));
|
---|
2828 | categoriesRangeMap.insert("IsAncientGreekNumbers", qMakePair(0x10140, 0x1018F));
|
---|
2829 | categoriesRangeMap.insert("IsOldItalic", qMakePair(0x10300, 0x1032F));
|
---|
2830 | categoriesRangeMap.insert("IsGothic", qMakePair(0x10330, 0x1034F));
|
---|
2831 | categoriesRangeMap.insert("IsUgaritic", qMakePair(0x10380, 0x1039F));
|
---|
2832 | categoriesRangeMap.insert("IsOldPersian", qMakePair(0x103A0, 0x103DF));
|
---|
2833 | categoriesRangeMap.insert("IsDeseret", qMakePair(0x10400, 0x1044F));
|
---|
2834 | categoriesRangeMap.insert("IsShavian", qMakePair(0x10450, 0x1047F));
|
---|
2835 | categoriesRangeMap.insert("IsOsmanya", qMakePair(0x10480, 0x104AF));
|
---|
2836 | categoriesRangeMap.insert("IsCypriotSyllabary", qMakePair(0x10800, 0x1083F));
|
---|
2837 | categoriesRangeMap.insert("IsKharoshthi", qMakePair(0x10A00, 0x10A5F));
|
---|
2838 | categoriesRangeMap.insert("IsByzantineMusicalSymbols", qMakePair(0x1D000, 0x1D0FF));
|
---|
2839 | categoriesRangeMap.insert("IsMusicalSymbols", qMakePair(0x1D100, 0x1D1FF));
|
---|
2840 | categoriesRangeMap.insert("IsAncientGreekMusicalNotation", qMakePair(0x1D200, 0x1D24F));
|
---|
2841 | categoriesRangeMap.insert("IsTaiXuanJingSymbols", qMakePair(0x1D300, 0x1D35F));
|
---|
2842 | categoriesRangeMap.insert("IsMathematicalAlphanumericSymbols", qMakePair(0x1D400, 0x1D7FF));
|
---|
2843 | categoriesRangeMap.insert("IsCJKUnifiedIdeographsExtensionB", qMakePair(0x20000, 0x2A6DF));
|
---|
2844 | categoriesRangeMap.insert("IsCJKCompatibilityIdeographsSupplement", qMakePair(0x2F800, 0x2FA1F));
|
---|
2845 | categoriesRangeMap.insert("IsTags", qMakePair(0xE0000, 0xE007F));
|
---|
2846 | categoriesRangeMap.insert("IsVariationSelectorsSupplement", qMakePair(0xE0100, 0xE01EF));
|
---|
2847 | categoriesRangeMap.insert("IsSupplementaryPrivateUseArea-A", qMakePair(0xF0000, 0xFFFFF));
|
---|
2848 | categoriesRangeMap.insert("IsSupplementaryPrivateUseArea-B", qMakePair(0x100000, 0x10FFFF));
|
---|
2849 | }
|
---|
2850 |
|
---|
2851 | int QRegExpEngine::getChar()
|
---|
2852 | {
|
---|
2853 | return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();
|
---|
2854 | }
|
---|
2855 |
|
---|
2856 | int QRegExpEngine::getEscape()
|
---|
2857 | {
|
---|
2858 | #ifndef QT_NO_REGEXP_ESCAPE
|
---|
2859 | const char tab[] = "afnrtv"; // no b, as \b means word boundary
|
---|
2860 | const char backTab[] = "\a\f\n\r\t\v";
|
---|
2861 | ushort low;
|
---|
2862 | int i;
|
---|
2863 | #endif
|
---|
2864 | ushort val;
|
---|
2865 | int prevCh = yyCh;
|
---|
2866 |
|
---|
2867 | if (prevCh == EOS) {
|
---|
2868 | error(RXERR_END);
|
---|
2869 | return Tok_Char | '\\';
|
---|
2870 | }
|
---|
2871 | yyCh = getChar();
|
---|
2872 | #ifndef QT_NO_REGEXP_ESCAPE
|
---|
2873 | if ((prevCh & ~0xff) == 0) {
|
---|
2874 | const char *p = strchr(tab, prevCh);
|
---|
2875 | if (p != 0)
|
---|
2876 | return Tok_Char | backTab[p - tab];
|
---|
2877 | }
|
---|
2878 | #endif
|
---|
2879 |
|
---|
2880 | switch (prevCh) {
|
---|
2881 | #ifndef QT_NO_REGEXP_ESCAPE
|
---|
2882 | case '0':
|
---|
2883 | val = 0;
|
---|
2884 | for (i = 0; i < 3; i++) {
|
---|
2885 | if (yyCh >= '0' && yyCh <= '7')
|
---|
2886 | val = (val << 3) | (yyCh - '0');
|
---|
2887 | else
|
---|
2888 | break;
|
---|
2889 | yyCh = getChar();
|
---|
2890 | }
|
---|
2891 | if ((val & ~0377) != 0)
|
---|
2892 | error(RXERR_OCTAL);
|
---|
2893 | return Tok_Char | val;
|
---|
2894 | #endif
|
---|
2895 | #ifndef QT_NO_REGEXP_ESCAPE
|
---|
2896 | case 'B':
|
---|
2897 | return Tok_NonWord;
|
---|
2898 | #endif
|
---|
2899 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
2900 | case 'D':
|
---|
2901 | // see QChar::isDigit()
|
---|
2902 | yyCharClass->addCategories(0x7fffffef);
|
---|
2903 | return Tok_CharClass;
|
---|
2904 | case 'S':
|
---|
2905 | // see QChar::isSpace()
|
---|
2906 | yyCharClass->addCategories(0x7ffff87f);
|
---|
2907 | yyCharClass->addRange(0x0000, 0x0008);
|
---|
2908 | yyCharClass->addRange(0x000e, 0x001f);
|
---|
2909 | yyCharClass->addRange(0x007f, 0x009f);
|
---|
2910 | return Tok_CharClass;
|
---|
2911 | case 'W':
|
---|
2912 | // see QChar::isLetterOrNumber() and QChar::isMark()
|
---|
2913 | yyCharClass->addCategories(0x7fe07f81);
|
---|
2914 | yyCharClass->addRange(0x203f, 0x2040);
|
---|
2915 | yyCharClass->addSingleton(0x2040);
|
---|
2916 | yyCharClass->addSingleton(0x2054);
|
---|
2917 | yyCharClass->addSingleton(0x30fb);
|
---|
2918 | yyCharClass->addRange(0xfe33, 0xfe34);
|
---|
2919 | yyCharClass->addRange(0xfe4d, 0xfe4f);
|
---|
2920 | yyCharClass->addSingleton(0xff3f);
|
---|
2921 | yyCharClass->addSingleton(0xff65);
|
---|
2922 | return Tok_CharClass;
|
---|
2923 | #endif
|
---|
2924 | #ifndef QT_NO_REGEXP_ESCAPE
|
---|
2925 | case 'b':
|
---|
2926 | return Tok_Word;
|
---|
2927 | #endif
|
---|
2928 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
2929 | case 'd':
|
---|
2930 | // see QChar::isDigit()
|
---|
2931 | yyCharClass->addCategories(0x00000010);
|
---|
2932 | return Tok_CharClass;
|
---|
2933 | case 's':
|
---|
2934 | // see QChar::isSpace()
|
---|
2935 | yyCharClass->addCategories(0x00000380);
|
---|
2936 | yyCharClass->addRange(0x0009, 0x000d);
|
---|
2937 | return Tok_CharClass;
|
---|
2938 | case 'w':
|
---|
2939 | // see QChar::isLetterOrNumber() and QChar::isMark()
|
---|
2940 | yyCharClass->addCategories(0x000f807e);
|
---|
2941 | yyCharClass->addSingleton(0x005f); // '_'
|
---|
2942 | return Tok_CharClass;
|
---|
2943 | case 'I':
|
---|
2944 | if (xmlSchemaExtensions) {
|
---|
2945 | yyCharClass->setNegative(!yyCharClass->negative());
|
---|
2946 | // fall through
|
---|
2947 | }
|
---|
2948 | case 'i':
|
---|
2949 | if (xmlSchemaExtensions) {
|
---|
2950 | yyCharClass->addCategories(0x000f807e);
|
---|
2951 | yyCharClass->addSingleton(0x003a); // ':'
|
---|
2952 | yyCharClass->addSingleton(0x005f); // '_'
|
---|
2953 | yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
|
---|
2954 | yyCharClass->addRange(0x0061, 0x007a); // [a-z]
|
---|
2955 | yyCharClass->addRange(0xc0, 0xd6);
|
---|
2956 | yyCharClass->addRange(0xd8, 0xf6);
|
---|
2957 | yyCharClass->addRange(0xf8, 0x2ff);
|
---|
2958 | yyCharClass->addRange(0x370, 0x37d);
|
---|
2959 | yyCharClass->addRange(0x37f, 0x1fff);
|
---|
2960 | yyCharClass->addRange(0x200c, 0x200d);
|
---|
2961 | yyCharClass->addRange(0x2070, 0x218f);
|
---|
2962 | yyCharClass->addRange(0x2c00, 0x2fef);
|
---|
2963 | yyCharClass->addRange(0x3001, 0xd7ff);
|
---|
2964 | yyCharClass->addRange(0xf900, 0xfdcf);
|
---|
2965 | yyCharClass->addRange(0xfdf0, 0xfffd);
|
---|
2966 | yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
|
---|
2967 | }
|
---|
2968 | return Tok_CharClass;
|
---|
2969 | case 'C':
|
---|
2970 | if (xmlSchemaExtensions) {
|
---|
2971 | yyCharClass->setNegative(!yyCharClass->negative());
|
---|
2972 | // fall through
|
---|
2973 | }
|
---|
2974 | case 'c':
|
---|
2975 | if (xmlSchemaExtensions) {
|
---|
2976 | yyCharClass->addCategories(0x000f807e);
|
---|
2977 | yyCharClass->addSingleton(0x002d); // '-'
|
---|
2978 | yyCharClass->addSingleton(0x002e); // '.'
|
---|
2979 | yyCharClass->addSingleton(0x003a); // ':'
|
---|
2980 | yyCharClass->addSingleton(0x005f); // '_'
|
---|
2981 | yyCharClass->addSingleton(0xb7);
|
---|
2982 | yyCharClass->addRange(0x0030, 0x0039); // [0-9]
|
---|
2983 | yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
|
---|
2984 | yyCharClass->addRange(0x0061, 0x007a); // [a-z]
|
---|
2985 | yyCharClass->addRange(0xc0, 0xd6);
|
---|
2986 | yyCharClass->addRange(0xd8, 0xf6);
|
---|
2987 | yyCharClass->addRange(0xf8, 0x2ff);
|
---|
2988 | yyCharClass->addRange(0x370, 0x37d);
|
---|
2989 | yyCharClass->addRange(0x37f, 0x1fff);
|
---|
2990 | yyCharClass->addRange(0x200c, 0x200d);
|
---|
2991 | yyCharClass->addRange(0x2070, 0x218f);
|
---|
2992 | yyCharClass->addRange(0x2c00, 0x2fef);
|
---|
2993 | yyCharClass->addRange(0x3001, 0xd7ff);
|
---|
2994 | yyCharClass->addRange(0xf900, 0xfdcf);
|
---|
2995 | yyCharClass->addRange(0xfdf0, 0xfffd);
|
---|
2996 | yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
|
---|
2997 | yyCharClass->addRange(0x0300, 0x036f);
|
---|
2998 | yyCharClass->addRange(0x203f, 0x2040);
|
---|
2999 | }
|
---|
3000 | return Tok_CharClass;
|
---|
3001 | case 'P':
|
---|
3002 | if (xmlSchemaExtensions) {
|
---|
3003 | yyCharClass->setNegative(!yyCharClass->negative());
|
---|
3004 | // fall through
|
---|
3005 | }
|
---|
3006 | case 'p':
|
---|
3007 | if (xmlSchemaExtensions) {
|
---|
3008 | if (yyCh != '{') {
|
---|
3009 | error(RXERR_CHARCLASS);
|
---|
3010 | return Tok_CharClass;
|
---|
3011 | }
|
---|
3012 |
|
---|
3013 | QByteArray category;
|
---|
3014 | yyCh = getChar();
|
---|
3015 | while (yyCh != '}') {
|
---|
3016 | if (yyCh == EOS) {
|
---|
3017 | error(RXERR_END);
|
---|
3018 | return Tok_CharClass;
|
---|
3019 | }
|
---|
3020 | category.append(yyCh);
|
---|
3021 | yyCh = getChar();
|
---|
3022 | }
|
---|
3023 | yyCh = getChar(); // skip closing '}'
|
---|
3024 |
|
---|
3025 | if (category == "M") {
|
---|
3026 | yyCharClass->addCategories(0x0000000e);
|
---|
3027 | } else if (category == "Mn") {
|
---|
3028 | yyCharClass->addCategories(0x00000002);
|
---|
3029 | } else if (category == "Mc") {
|
---|
3030 | yyCharClass->addCategories(0x00000004);
|
---|
3031 | } else if (category == "Me") {
|
---|
3032 | yyCharClass->addCategories(0x00000008);
|
---|
3033 | } else if (category == "N") {
|
---|
3034 | yyCharClass->addCategories(0x00000070);
|
---|
3035 | } else if (category == "Nd") {
|
---|
3036 | yyCharClass->addCategories(0x00000010);
|
---|
3037 | } else if (category == "Nl") {
|
---|
3038 | yyCharClass->addCategories(0x00000020);
|
---|
3039 | } else if (category == "No") {
|
---|
3040 | yyCharClass->addCategories(0x00000040);
|
---|
3041 | } else if (category == "Z") {
|
---|
3042 | yyCharClass->addCategories(0x00000380);
|
---|
3043 | } else if (category == "Zs") {
|
---|
3044 | yyCharClass->addCategories(0x00000080);
|
---|
3045 | } else if (category == "Zl") {
|
---|
3046 | yyCharClass->addCategories(0x00000100);
|
---|
3047 | } else if (category == "Zp") {
|
---|
3048 | yyCharClass->addCategories(0x00000200);
|
---|
3049 | } else if (category == "C") {
|
---|
3050 | yyCharClass->addCategories(0x00006c00);
|
---|
3051 | } else if (category == "Cc") {
|
---|
3052 | yyCharClass->addCategories(0x00000400);
|
---|
3053 | } else if (category == "Cf") {
|
---|
3054 | yyCharClass->addCategories(0x00000800);
|
---|
3055 | } else if (category == "Cs") {
|
---|
3056 | yyCharClass->addCategories(0x00001000);
|
---|
3057 | } else if (category == "Co") {
|
---|
3058 | yyCharClass->addCategories(0x00002000);
|
---|
3059 | } else if (category == "Cn") {
|
---|
3060 | yyCharClass->addCategories(0x00004000);
|
---|
3061 | } else if (category == "L") {
|
---|
3062 | yyCharClass->addCategories(0x000f8000);
|
---|
3063 | } else if (category == "Lu") {
|
---|
3064 | yyCharClass->addCategories(0x00008000);
|
---|
3065 | } else if (category == "Ll") {
|
---|
3066 | yyCharClass->addCategories(0x00010000);
|
---|
3067 | } else if (category == "Lt") {
|
---|
3068 | yyCharClass->addCategories(0x00020000);
|
---|
3069 | } else if (category == "Lm") {
|
---|
3070 | yyCharClass->addCategories(0x00040000);
|
---|
3071 | } else if (category == "Lo") {
|
---|
3072 | yyCharClass->addCategories(0x00080000);
|
---|
3073 | } else if (category == "P") {
|
---|
3074 | yyCharClass->addCategories(0x4f580780);
|
---|
3075 | } else if (category == "Pc") {
|
---|
3076 | yyCharClass->addCategories(0x00100000);
|
---|
3077 | } else if (category == "Pd") {
|
---|
3078 | yyCharClass->addCategories(0x00200000);
|
---|
3079 | } else if (category == "Ps") {
|
---|
3080 | yyCharClass->addCategories(0x00400000);
|
---|
3081 | } else if (category == "Pe") {
|
---|
3082 | yyCharClass->addCategories(0x00800000);
|
---|
3083 | } else if (category == "Pi") {
|
---|
3084 | yyCharClass->addCategories(0x01000000);
|
---|
3085 | } else if (category == "Pf") {
|
---|
3086 | yyCharClass->addCategories(0x02000000);
|
---|
3087 | } else if (category == "Po") {
|
---|
3088 | yyCharClass->addCategories(0x04000000);
|
---|
3089 | } else if (category == "S") {
|
---|
3090 | yyCharClass->addCategories(0x78000000);
|
---|
3091 | } else if (category == "Sm") {
|
---|
3092 | yyCharClass->addCategories(0x08000000);
|
---|
3093 | } else if (category == "Sc") {
|
---|
3094 | yyCharClass->addCategories(0x10000000);
|
---|
3095 | } else if (category == "Sk") {
|
---|
3096 | yyCharClass->addCategories(0x20000000);
|
---|
3097 | } else if (category == "So") {
|
---|
3098 | yyCharClass->addCategories(0x40000000);
|
---|
3099 | } else if (category.startsWith("Is")) {
|
---|
3100 | if (categoriesRangeMap.isEmpty())
|
---|
3101 | setupCategoriesRangeMap();
|
---|
3102 |
|
---|
3103 | if (categoriesRangeMap.contains(category)) {
|
---|
3104 | const QPair<int, int> range = categoriesRangeMap.value(category);
|
---|
3105 | yyCharClass->addRange(range.first, range.second);
|
---|
3106 | } else {
|
---|
3107 | error(RXERR_CATEGORY);
|
---|
3108 | }
|
---|
3109 | } else {
|
---|
3110 | error(RXERR_CATEGORY);
|
---|
3111 | }
|
---|
3112 | }
|
---|
3113 | return Tok_CharClass;
|
---|
3114 | #endif
|
---|
3115 | #ifndef QT_NO_REGEXP_ESCAPE
|
---|
3116 | case 'x':
|
---|
3117 | val = 0;
|
---|
3118 | for (i = 0; i < 4; i++) {
|
---|
3119 | low = QChar(yyCh).toLower().unicode();
|
---|
3120 | if (low >= '0' && low <= '9')
|
---|
3121 | val = (val << 4) | (low - '0');
|
---|
3122 | else if (low >= 'a' && low <= 'f')
|
---|
3123 | val = (val << 4) | (low - 'a' + 10);
|
---|
3124 | else
|
---|
3125 | break;
|
---|
3126 | yyCh = getChar();
|
---|
3127 | }
|
---|
3128 | return Tok_Char | val;
|
---|
3129 | #endif
|
---|
3130 | default:
|
---|
3131 | if (prevCh >= '1' && prevCh <= '9') {
|
---|
3132 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
3133 | val = prevCh - '0';
|
---|
3134 | while (yyCh >= '0' && yyCh <= '9') {
|
---|
3135 | val = (val * 10) + (yyCh - '0');
|
---|
3136 | yyCh = getChar();
|
---|
3137 | }
|
---|
3138 | return Tok_BackRef | val;
|
---|
3139 | #else
|
---|
3140 | error(RXERR_DISABLED);
|
---|
3141 | #endif
|
---|
3142 | }
|
---|
3143 | return Tok_Char | prevCh;
|
---|
3144 | }
|
---|
3145 | }
|
---|
3146 |
|
---|
3147 | #ifndef QT_NO_REGEXP_INTERVAL
|
---|
3148 | int QRegExpEngine::getRep(int def)
|
---|
3149 | {
|
---|
3150 | if (yyCh >= '0' && yyCh <= '9') {
|
---|
3151 | int rep = 0;
|
---|
3152 | do {
|
---|
3153 | rep = 10 * rep + yyCh - '0';
|
---|
3154 | if (rep >= InftyRep) {
|
---|
3155 | error(RXERR_REPETITION);
|
---|
3156 | rep = def;
|
---|
3157 | }
|
---|
3158 | yyCh = getChar();
|
---|
3159 | } while (yyCh >= '0' && yyCh <= '9');
|
---|
3160 | return rep;
|
---|
3161 | } else {
|
---|
3162 | return def;
|
---|
3163 | }
|
---|
3164 | }
|
---|
3165 | #endif
|
---|
3166 |
|
---|
3167 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
3168 | void QRegExpEngine::skipChars(int n)
|
---|
3169 | {
|
---|
3170 | if (n > 0) {
|
---|
3171 | yyPos += n - 1;
|
---|
3172 | yyCh = getChar();
|
---|
3173 | }
|
---|
3174 | }
|
---|
3175 | #endif
|
---|
3176 |
|
---|
3177 | void QRegExpEngine::error(const char *msg)
|
---|
3178 | {
|
---|
3179 | if (yyError.isEmpty())
|
---|
3180 | yyError = QLatin1String(msg);
|
---|
3181 | }
|
---|
3182 |
|
---|
3183 | void QRegExpEngine::startTokenizer(const QChar *rx, int len)
|
---|
3184 | {
|
---|
3185 | yyIn = rx;
|
---|
3186 | yyPos0 = 0;
|
---|
3187 | yyPos = 0;
|
---|
3188 | yyLen = len;
|
---|
3189 | yyCh = getChar();
|
---|
3190 | yyCharClass.reset(new QRegExpCharClass);
|
---|
3191 | yyMinRep = 0;
|
---|
3192 | yyMaxRep = 0;
|
---|
3193 | yyError = QString();
|
---|
3194 | }
|
---|
3195 |
|
---|
3196 | int QRegExpEngine::getToken()
|
---|
3197 | {
|
---|
3198 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
3199 | ushort pendingCh = 0;
|
---|
3200 | bool charPending;
|
---|
3201 | bool rangePending;
|
---|
3202 | int tok;
|
---|
3203 | #endif
|
---|
3204 | int prevCh = yyCh;
|
---|
3205 |
|
---|
3206 | yyPos0 = yyPos - 1;
|
---|
3207 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
3208 | yyCharClass->clear();
|
---|
3209 | #endif
|
---|
3210 | yyMinRep = 0;
|
---|
3211 | yyMaxRep = 0;
|
---|
3212 | yyCh = getChar();
|
---|
3213 |
|
---|
3214 | switch (prevCh) {
|
---|
3215 | case EOS:
|
---|
3216 | yyPos0 = yyPos;
|
---|
3217 | return Tok_Eos;
|
---|
3218 | case '$':
|
---|
3219 | return Tok_Dollar;
|
---|
3220 | case '(':
|
---|
3221 | if (yyCh == '?') {
|
---|
3222 | prevCh = getChar();
|
---|
3223 | yyCh = getChar();
|
---|
3224 | switch (prevCh) {
|
---|
3225 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
3226 | case '!':
|
---|
3227 | return Tok_NegLookahead;
|
---|
3228 | case '=':
|
---|
3229 | return Tok_PosLookahead;
|
---|
3230 | #endif
|
---|
3231 | case ':':
|
---|
3232 | return Tok_MagicLeftParen;
|
---|
3233 | default:
|
---|
3234 | error(RXERR_LOOKAHEAD);
|
---|
3235 | return Tok_MagicLeftParen;
|
---|
3236 | }
|
---|
3237 | } else {
|
---|
3238 | return Tok_LeftParen;
|
---|
3239 | }
|
---|
3240 | case ')':
|
---|
3241 | return Tok_RightParen;
|
---|
3242 | case '*':
|
---|
3243 | yyMinRep = 0;
|
---|
3244 | yyMaxRep = InftyRep;
|
---|
3245 | return Tok_Quantifier;
|
---|
3246 | case '+':
|
---|
3247 | yyMinRep = 1;
|
---|
3248 | yyMaxRep = InftyRep;
|
---|
3249 | return Tok_Quantifier;
|
---|
3250 | case '.':
|
---|
3251 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
3252 | yyCharClass->setNegative(true);
|
---|
3253 | #endif
|
---|
3254 | return Tok_CharClass;
|
---|
3255 | case '?':
|
---|
3256 | yyMinRep = 0;
|
---|
3257 | yyMaxRep = 1;
|
---|
3258 | return Tok_Quantifier;
|
---|
3259 | case '[':
|
---|
3260 | #ifndef QT_NO_REGEXP_CCLASS
|
---|
3261 | if (yyCh == '^') {
|
---|
3262 | yyCharClass->setNegative(true);
|
---|
3263 | yyCh = getChar();
|
---|
3264 | }
|
---|
3265 | charPending = false;
|
---|
3266 | rangePending = false;
|
---|
3267 | do {
|
---|
3268 | if (yyCh == '-' && charPending && !rangePending) {
|
---|
3269 | rangePending = true;
|
---|
3270 | yyCh = getChar();
|
---|
3271 | } else {
|
---|
3272 | if (charPending && !rangePending) {
|
---|
3273 | yyCharClass->addSingleton(pendingCh);
|
---|
3274 | charPending = false;
|
---|
3275 | }
|
---|
3276 | if (yyCh == '\\') {
|
---|
3277 | yyCh = getChar();
|
---|
3278 | tok = getEscape();
|
---|
3279 | if (tok == Tok_Word)
|
---|
3280 | tok = '\b';
|
---|
3281 | } else {
|
---|
3282 | tok = Tok_Char | yyCh;
|
---|
3283 | yyCh = getChar();
|
---|
3284 | }
|
---|
3285 | if (tok == Tok_CharClass) {
|
---|
3286 | if (rangePending) {
|
---|
3287 | yyCharClass->addSingleton('-');
|
---|
3288 | yyCharClass->addSingleton(pendingCh);
|
---|
3289 | charPending = false;
|
---|
3290 | rangePending = false;
|
---|
3291 | }
|
---|
3292 | } else if ((tok & Tok_Char) != 0) {
|
---|
3293 | if (rangePending) {
|
---|
3294 | yyCharClass->addRange(pendingCh, tok ^ Tok_Char);
|
---|
3295 | charPending = false;
|
---|
3296 | rangePending = false;
|
---|
3297 | } else {
|
---|
3298 | pendingCh = tok ^ Tok_Char;
|
---|
3299 | charPending = true;
|
---|
3300 | }
|
---|
3301 | } else {
|
---|
3302 | error(RXERR_CHARCLASS);
|
---|
3303 | }
|
---|
3304 | }
|
---|
3305 | } while (yyCh != ']' && yyCh != EOS);
|
---|
3306 | if (rangePending)
|
---|
3307 | yyCharClass->addSingleton('-');
|
---|
3308 | if (charPending)
|
---|
3309 | yyCharClass->addSingleton(pendingCh);
|
---|
3310 | if (yyCh == EOS)
|
---|
3311 | error(RXERR_END);
|
---|
3312 | else
|
---|
3313 | yyCh = getChar();
|
---|
3314 | return Tok_CharClass;
|
---|
3315 | #else
|
---|
3316 | error(RXERR_END);
|
---|
3317 | return Tok_Char | '[';
|
---|
3318 | #endif
|
---|
3319 | case '\\':
|
---|
3320 | return getEscape();
|
---|
3321 | case ']':
|
---|
3322 | error(RXERR_LEFTDELIM);
|
---|
3323 | return Tok_Char | ']';
|
---|
3324 | case '^':
|
---|
3325 | return Tok_Caret;
|
---|
3326 | case '{':
|
---|
3327 | #ifndef QT_NO_REGEXP_INTERVAL
|
---|
3328 | yyMinRep = getRep(0);
|
---|
3329 | yyMaxRep = yyMinRep;
|
---|
3330 | if (yyCh == ',') {
|
---|
3331 | yyCh = getChar();
|
---|
3332 | yyMaxRep = getRep(InftyRep);
|
---|
3333 | }
|
---|
3334 | if (yyMaxRep < yyMinRep)
|
---|
3335 | error(RXERR_INTERVAL);
|
---|
3336 | if (yyCh != '}')
|
---|
3337 | error(RXERR_REPETITION);
|
---|
3338 | yyCh = getChar();
|
---|
3339 | return Tok_Quantifier;
|
---|
3340 | #else
|
---|
3341 | error(RXERR_DISABLED);
|
---|
3342 | return Tok_Char | '{';
|
---|
3343 | #endif
|
---|
3344 | case '|':
|
---|
3345 | return Tok_Bar;
|
---|
3346 | case '}':
|
---|
3347 | error(RXERR_LEFTDELIM);
|
---|
3348 | return Tok_Char | '}';
|
---|
3349 | default:
|
---|
3350 | return Tok_Char | prevCh;
|
---|
3351 | }
|
---|
3352 | }
|
---|
3353 |
|
---|
3354 | int QRegExpEngine::parse(const QChar *pattern, int len)
|
---|
3355 | {
|
---|
3356 | valid = true;
|
---|
3357 | startTokenizer(pattern, len);
|
---|
3358 | yyTok = getToken();
|
---|
3359 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
3360 | yyMayCapture = true;
|
---|
3361 | #else
|
---|
3362 | yyMayCapture = false;
|
---|
3363 | #endif
|
---|
3364 |
|
---|
3365 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
3366 | int atom = startAtom(false);
|
---|
3367 | #endif
|
---|
3368 | QRegExpCharClass anything;
|
---|
3369 | Box box(this); // create InitialState
|
---|
3370 | box.set(anything);
|
---|
3371 | Box rightBox(this); // create FinalState
|
---|
3372 | rightBox.set(anything);
|
---|
3373 |
|
---|
3374 | Box middleBox(this);
|
---|
3375 | parseExpression(&middleBox);
|
---|
3376 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
3377 | finishAtom(atom, false);
|
---|
3378 | #endif
|
---|
3379 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
3380 | middleBox.setupHeuristics();
|
---|
3381 | #endif
|
---|
3382 | box.cat(middleBox);
|
---|
3383 | box.cat(rightBox);
|
---|
3384 | yyCharClass.reset(0);
|
---|
3385 |
|
---|
3386 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
3387 | for (int i = 0; i < nf; ++i) {
|
---|
3388 | switch (f[i].capture) {
|
---|
3389 | case QRegExpAtom::NoCapture:
|
---|
3390 | break;
|
---|
3391 | case QRegExpAtom::OfficialCapture:
|
---|
3392 | f[i].capture = ncap;
|
---|
3393 | captureForOfficialCapture.append(ncap);
|
---|
3394 | ++ncap;
|
---|
3395 | ++officialncap;
|
---|
3396 | break;
|
---|
3397 | case QRegExpAtom::UnofficialCapture:
|
---|
3398 | f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;
|
---|
3399 | }
|
---|
3400 | }
|
---|
3401 |
|
---|
3402 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
3403 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
3404 | if (officialncap == 0 && nbrefs == 0) {
|
---|
3405 | ncap = nf = 0;
|
---|
3406 | f.clear();
|
---|
3407 | }
|
---|
3408 | #endif
|
---|
3409 | // handle the case where there's a \5 with no corresponding capture
|
---|
3410 | // (captureForOfficialCapture.size() != officialncap)
|
---|
3411 | for (int i = 0; i < nbrefs - officialncap; ++i) {
|
---|
3412 | captureForOfficialCapture.append(ncap);
|
---|
3413 | ++ncap;
|
---|
3414 | }
|
---|
3415 | #endif
|
---|
3416 | #endif
|
---|
3417 |
|
---|
3418 | if (!yyError.isEmpty())
|
---|
3419 | return -1;
|
---|
3420 |
|
---|
3421 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
3422 | const QRegExpAutomatonState &sinit = s.at(InitialState);
|
---|
3423 | caretAnchored = !sinit.anchors.isEmpty();
|
---|
3424 | if (caretAnchored) {
|
---|
3425 | const QMap<int, int> &anchors = sinit.anchors;
|
---|
3426 | QMap<int, int>::const_iterator a;
|
---|
3427 | for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {
|
---|
3428 | if (
|
---|
3429 | #ifndef QT_NO_REGEXP_ANCHOR_ALT
|
---|
3430 | (*a & Anchor_Alternation) != 0 ||
|
---|
3431 | #endif
|
---|
3432 | (*a & Anchor_Caret) == 0)
|
---|
3433 | {
|
---|
3434 | caretAnchored = false;
|
---|
3435 | break;
|
---|
3436 | }
|
---|
3437 | }
|
---|
3438 | }
|
---|
3439 | #endif
|
---|
3440 |
|
---|
3441 | // cleanup anchors
|
---|
3442 | int numStates = s.count();
|
---|
3443 | for (int i = 0; i < numStates; ++i) {
|
---|
3444 | QRegExpAutomatonState &state = s[i];
|
---|
3445 | if (!state.anchors.isEmpty()) {
|
---|
3446 | QMap<int, int>::iterator a = state.anchors.begin();
|
---|
3447 | while (a != state.anchors.end()) {
|
---|
3448 | if (a.value() == 0)
|
---|
3449 | a = state.anchors.erase(a);
|
---|
3450 | else
|
---|
3451 | ++a;
|
---|
3452 | }
|
---|
3453 | }
|
---|
3454 | }
|
---|
3455 |
|
---|
3456 | return yyPos0;
|
---|
3457 | }
|
---|
3458 |
|
---|
3459 | void QRegExpEngine::parseAtom(Box *box)
|
---|
3460 | {
|
---|
3461 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
3462 | QRegExpEngine *eng = 0;
|
---|
3463 | bool neg;
|
---|
3464 | int len;
|
---|
3465 | #endif
|
---|
3466 |
|
---|
3467 | if ((yyTok & Tok_Char) != 0) {
|
---|
3468 | box->set(QChar(yyTok ^ Tok_Char));
|
---|
3469 | } else {
|
---|
3470 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
3471 | trivial = false;
|
---|
3472 | #endif
|
---|
3473 | switch (yyTok) {
|
---|
3474 | case Tok_Dollar:
|
---|
3475 | box->catAnchor(Anchor_Dollar);
|
---|
3476 | break;
|
---|
3477 | case Tok_Caret:
|
---|
3478 | box->catAnchor(Anchor_Caret);
|
---|
3479 | break;
|
---|
3480 | #ifndef QT_NO_REGEXP_LOOKAHEAD
|
---|
3481 | case Tok_PosLookahead:
|
---|
3482 | case Tok_NegLookahead:
|
---|
3483 | neg = (yyTok == Tok_NegLookahead);
|
---|
3484 | eng = new QRegExpEngine(cs, greedyQuantifiers);
|
---|
3485 | len = eng->parse(yyIn + yyPos - 1, yyLen - yyPos + 1);
|
---|
3486 | if (len >= 0)
|
---|
3487 | skipChars(len);
|
---|
3488 | else
|
---|
3489 | error(RXERR_LOOKAHEAD);
|
---|
3490 | box->catAnchor(addLookahead(eng, neg));
|
---|
3491 | yyTok = getToken();
|
---|
3492 | if (yyTok != Tok_RightParen)
|
---|
3493 | error(RXERR_LOOKAHEAD);
|
---|
3494 | break;
|
---|
3495 | #endif
|
---|
3496 | #ifndef QT_NO_REGEXP_ESCAPE
|
---|
3497 | case Tok_Word:
|
---|
3498 | box->catAnchor(Anchor_Word);
|
---|
3499 | break;
|
---|
3500 | case Tok_NonWord:
|
---|
3501 | box->catAnchor(Anchor_NonWord);
|
---|
3502 | break;
|
---|
3503 | #endif
|
---|
3504 | case Tok_LeftParen:
|
---|
3505 | case Tok_MagicLeftParen:
|
---|
3506 | yyTok = getToken();
|
---|
3507 | parseExpression(box);
|
---|
3508 | if (yyTok != Tok_RightParen)
|
---|
3509 | error(RXERR_END);
|
---|
3510 | break;
|
---|
3511 | case Tok_CharClass:
|
---|
3512 | box->set(*yyCharClass);
|
---|
3513 | break;
|
---|
3514 | case Tok_Quantifier:
|
---|
3515 | error(RXERR_REPETITION);
|
---|
3516 | break;
|
---|
3517 | default:
|
---|
3518 | #ifndef QT_NO_REGEXP_BACKREF
|
---|
3519 | if ((yyTok & Tok_BackRef) != 0)
|
---|
3520 | box->set(yyTok ^ Tok_BackRef);
|
---|
3521 | else
|
---|
3522 | #endif
|
---|
3523 | error(RXERR_DISABLED);
|
---|
3524 | }
|
---|
3525 | }
|
---|
3526 | yyTok = getToken();
|
---|
3527 | }
|
---|
3528 |
|
---|
3529 | void QRegExpEngine::parseFactor(Box *box)
|
---|
3530 | {
|
---|
3531 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
3532 | int outerAtom = greedyQuantifiers ? startAtom(false) : -1;
|
---|
3533 | int innerAtom = startAtom(yyMayCapture && yyTok == Tok_LeftParen);
|
---|
3534 | bool magicLeftParen = (yyTok == Tok_MagicLeftParen);
|
---|
3535 | #else
|
---|
3536 | const int innerAtom = -1;
|
---|
3537 | #endif
|
---|
3538 |
|
---|
3539 | #ifndef QT_NO_REGEXP_INTERVAL
|
---|
3540 | #define YYREDO() \
|
---|
3541 | yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
|
---|
3542 | *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
|
---|
3543 |
|
---|
3544 | const QChar *in = yyIn;
|
---|
3545 | int pos0 = yyPos0;
|
---|
3546 | int pos = yyPos;
|
---|
3547 | int len = yyLen;
|
---|
3548 | int ch = yyCh;
|
---|
3549 | QRegExpCharClass charClass;
|
---|
3550 | if (yyTok == Tok_CharClass)
|
---|
3551 | charClass = *yyCharClass;
|
---|
3552 | int tok = yyTok;
|
---|
3553 | bool mayCapture = yyMayCapture;
|
---|
3554 | #endif
|
---|
3555 |
|
---|
3556 | parseAtom(box);
|
---|
3557 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
3558 | finishAtom(innerAtom, magicLeftParen);
|
---|
3559 | #endif
|
---|
3560 |
|
---|
3561 | bool hasQuantifier = (yyTok == Tok_Quantifier);
|
---|
3562 | if (hasQuantifier) {
|
---|
3563 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
3564 | trivial = false;
|
---|
3565 | #endif
|
---|
3566 | if (yyMaxRep == InftyRep) {
|
---|
3567 | box->plus(innerAtom);
|
---|
3568 | #ifndef QT_NO_REGEXP_INTERVAL
|
---|
3569 | } else if (yyMaxRep == 0) {
|
---|
3570 | box->clear();
|
---|
3571 | #endif
|
---|
3572 | }
|
---|
3573 | if (yyMinRep == 0)
|
---|
3574 | box->opt();
|
---|
3575 |
|
---|
3576 | #ifndef QT_NO_REGEXP_INTERVAL
|
---|
3577 | yyMayCapture = false;
|
---|
3578 | int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1;
|
---|
3579 | int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1);
|
---|
3580 |
|
---|
3581 | Box rightBox(this);
|
---|
3582 | int i;
|
---|
3583 |
|
---|
3584 | for (i = 0; i < beta; i++) {
|
---|
3585 | YYREDO();
|
---|
3586 | Box leftBox(this);
|
---|
3587 | parseAtom(&leftBox);
|
---|
3588 | leftBox.cat(rightBox);
|
---|
3589 | leftBox.opt();
|
---|
3590 | rightBox = leftBox;
|
---|
3591 | }
|
---|
3592 | for (i = 0; i < alpha; i++) {
|
---|
3593 | YYREDO();
|
---|
3594 | Box leftBox(this);
|
---|
3595 | parseAtom(&leftBox);
|
---|
3596 | leftBox.cat(rightBox);
|
---|
3597 | rightBox = leftBox;
|
---|
3598 | }
|
---|
3599 | rightBox.cat(*box);
|
---|
3600 | *box = rightBox;
|
---|
3601 | #endif
|
---|
3602 | yyTok = getToken();
|
---|
3603 | #ifndef QT_NO_REGEXP_INTERVAL
|
---|
3604 | yyMayCapture = mayCapture;
|
---|
3605 | #endif
|
---|
3606 | }
|
---|
3607 | #undef YYREDO
|
---|
3608 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
3609 | if (greedyQuantifiers)
|
---|
3610 | finishAtom(outerAtom, hasQuantifier);
|
---|
3611 | #endif
|
---|
3612 | }
|
---|
3613 |
|
---|
3614 | void QRegExpEngine::parseTerm(Box *box)
|
---|
3615 | {
|
---|
3616 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
3617 | if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)
|
---|
3618 | parseFactor(box);
|
---|
3619 | #endif
|
---|
3620 | while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {
|
---|
3621 | Box rightBox(this);
|
---|
3622 | parseFactor(&rightBox);
|
---|
3623 | box->cat(rightBox);
|
---|
3624 | }
|
---|
3625 | }
|
---|
3626 |
|
---|
3627 | void QRegExpEngine::parseExpression(Box *box)
|
---|
3628 | {
|
---|
3629 | parseTerm(box);
|
---|
3630 | while (yyTok == Tok_Bar) {
|
---|
3631 | #ifndef QT_NO_REGEXP_OPTIM
|
---|
3632 | trivial = false;
|
---|
3633 | #endif
|
---|
3634 | Box rightBox(this);
|
---|
3635 | yyTok = getToken();
|
---|
3636 | parseTerm(&rightBox);
|
---|
3637 | box->orx(rightBox);
|
---|
3638 | }
|
---|
3639 | }
|
---|
3640 |
|
---|
3641 | /*
|
---|
3642 | The struct QRegExpPrivate contains the private data of a regular
|
---|
3643 | expression other than the automaton. It makes it possible for many
|
---|
3644 | QRegExp objects to use the same QRegExpEngine object with different
|
---|
3645 | QRegExpPrivate objects.
|
---|
3646 | */
|
---|
3647 | struct QRegExpPrivate
|
---|
3648 | {
|
---|
3649 | QRegExpEngine *eng;
|
---|
3650 | QRegExpEngineKey engineKey;
|
---|
3651 | bool minimal;
|
---|
3652 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
3653 | QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()
|
---|
3654 | QStringList capturedCache; // what QRegExp::capturedTexts() returned last
|
---|
3655 | #endif
|
---|
3656 | QRegExpMatchState matchState;
|
---|
3657 |
|
---|
3658 | inline QRegExpPrivate()
|
---|
3659 | : eng(0), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }
|
---|
3660 | inline QRegExpPrivate(const QRegExpEngineKey &key)
|
---|
3661 | : eng(0), engineKey(key), minimal(false) {}
|
---|
3662 | };
|
---|
3663 |
|
---|
3664 | #if !defined(QT_NO_REGEXP_OPTIM)
|
---|
3665 | uint qHash(const QRegExpEngineKey &key)
|
---|
3666 | {
|
---|
3667 | return qHash(key.pattern);
|
---|
3668 | }
|
---|
3669 |
|
---|
3670 | typedef QCache<QRegExpEngineKey, QRegExpEngine> EngineCache;
|
---|
3671 | Q_GLOBAL_STATIC(EngineCache, globalEngineCache)
|
---|
3672 | Q_GLOBAL_STATIC(QMutex, mutex)
|
---|
3673 | #endif // QT_NO_REGEXP_OPTIM
|
---|
3674 |
|
---|
3675 | static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key)
|
---|
3676 | {
|
---|
3677 | if (!eng->ref.deref()) {
|
---|
3678 | #if !defined(QT_NO_REGEXP_OPTIM)
|
---|
3679 | if (globalEngineCache()) {
|
---|
3680 | QMutexLocker locker(mutex());
|
---|
3681 | QT_TRY {
|
---|
3682 | globalEngineCache()->insert(key, eng, 4 + key.pattern.length() / 4);
|
---|
3683 | } QT_CATCH(const std::bad_alloc &) {
|
---|
3684 | // in case of an exception (e.g. oom), just delete the engine
|
---|
3685 | delete eng;
|
---|
3686 | }
|
---|
3687 | } else {
|
---|
3688 | delete eng;
|
---|
3689 | }
|
---|
3690 | #else
|
---|
3691 | Q_UNUSED(key);
|
---|
3692 | delete eng;
|
---|
3693 | #endif
|
---|
3694 | }
|
---|
3695 | }
|
---|
3696 |
|
---|
3697 | static void prepareEngine_helper(QRegExpPrivate *priv)
|
---|
3698 | {
|
---|
3699 | bool initMatchState = !priv->eng;
|
---|
3700 | #if !defined(QT_NO_REGEXP_OPTIM)
|
---|
3701 | if (!priv->eng && globalEngineCache()) {
|
---|
3702 | QMutexLocker locker(mutex());
|
---|
3703 | priv->eng = globalEngineCache()->take(priv->engineKey);
|
---|
3704 | if (priv->eng != 0)
|
---|
3705 | priv->eng->ref.ref();
|
---|
3706 | }
|
---|
3707 | #endif // QT_NO_REGEXP_OPTIM
|
---|
3708 |
|
---|
3709 | if (!priv->eng)
|
---|
3710 | priv->eng = new QRegExpEngine(priv->engineKey);
|
---|
3711 |
|
---|
3712 | if (initMatchState)
|
---|
3713 | priv->matchState.prepareForMatch(priv->eng);
|
---|
3714 | }
|
---|
3715 |
|
---|
3716 | inline static void prepareEngine(QRegExpPrivate *priv)
|
---|
3717 | {
|
---|
3718 | if (priv->eng)
|
---|
3719 | return;
|
---|
3720 | prepareEngine_helper(priv);
|
---|
3721 | }
|
---|
3722 |
|
---|
3723 | static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str)
|
---|
3724 | {
|
---|
3725 | prepareEngine(priv);
|
---|
3726 | priv->matchState.prepareForMatch(priv->eng);
|
---|
3727 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
3728 | priv->t = str;
|
---|
3729 | priv->capturedCache.clear();
|
---|
3730 | #else
|
---|
3731 | Q_UNUSED(str);
|
---|
3732 | #endif
|
---|
3733 | }
|
---|
3734 |
|
---|
3735 | static void invalidateEngine(QRegExpPrivate *priv)
|
---|
3736 | {
|
---|
3737 | if (priv->eng != 0) {
|
---|
3738 | derefEngine(priv->eng, priv->engineKey);
|
---|
3739 | priv->eng = 0;
|
---|
3740 | priv->matchState.drain();
|
---|
3741 | }
|
---|
3742 | }
|
---|
3743 |
|
---|
3744 | /*!
|
---|
3745 | \enum QRegExp::CaretMode
|
---|
3746 |
|
---|
3747 | The CaretMode enum defines the different meanings of the caret
|
---|
3748 | (\bold{^}) in a regular expression. The possible values are:
|
---|
3749 |
|
---|
3750 | \value CaretAtZero
|
---|
3751 | The caret corresponds to index 0 in the searched string.
|
---|
3752 |
|
---|
3753 | \value CaretAtOffset
|
---|
3754 | The caret corresponds to the start offset of the search.
|
---|
3755 |
|
---|
3756 | \value CaretWontMatch
|
---|
3757 | The caret never matches.
|
---|
3758 | */
|
---|
3759 |
|
---|
3760 | /*!
|
---|
3761 | \enum QRegExp::PatternSyntax
|
---|
3762 |
|
---|
3763 | The syntax used to interpret the meaning of the pattern.
|
---|
3764 |
|
---|
3765 | \value RegExp A rich Perl-like pattern matching syntax. This is
|
---|
3766 | the default.
|
---|
3767 |
|
---|
3768 | \value RegExp2 Like RegExp, but with \l{greedy quantifiers}. This
|
---|
3769 | will be the default in Qt 5. (Introduced in Qt 4.2.)
|
---|
3770 |
|
---|
3771 | \value Wildcard This provides a simple pattern matching syntax
|
---|
3772 | similar to that used by shells (command interpreters) for "file
|
---|
3773 | globbing". See \l{Wildcard Matching}.
|
---|
3774 |
|
---|
3775 | \value WildcardUnix This is similar to Wildcard but with the
|
---|
3776 | behavior of a Unix shell. The wildcard characters can be escaped
|
---|
3777 | with the character "\".
|
---|
3778 |
|
---|
3779 | \value FixedString The pattern is a fixed string. This is
|
---|
3780 | equivalent to using the RegExp pattern on a string in
|
---|
3781 | which all metacharacters are escaped using escape().
|
---|
3782 |
|
---|
3783 | \value W3CXmlSchema11 The pattern is a regular expression as
|
---|
3784 | defined by the W3C XML Schema 1.1 specification.
|
---|
3785 |
|
---|
3786 | \sa setPatternSyntax()
|
---|
3787 | */
|
---|
3788 |
|
---|
3789 | /*!
|
---|
3790 | Constructs an empty regexp.
|
---|
3791 |
|
---|
3792 | \sa isValid(), errorString()
|
---|
3793 | */
|
---|
3794 | QRegExp::QRegExp()
|
---|
3795 | {
|
---|
3796 | priv = new QRegExpPrivate;
|
---|
3797 | }
|
---|
3798 |
|
---|
3799 | /*!
|
---|
3800 | Constructs a regular expression object for the given \a pattern
|
---|
3801 | string. The pattern must be given using wildcard notation if \a
|
---|
3802 | syntax is \l Wildcard; the default is \l RegExp. The pattern is
|
---|
3803 | case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
|
---|
3804 | greedy (maximal), but can be changed by calling
|
---|
3805 | setMinimal().
|
---|
3806 |
|
---|
3807 | \sa setPattern(), setCaseSensitivity(), setPatternSyntax()
|
---|
3808 | */
|
---|
3809 | QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
|
---|
3810 | {
|
---|
3811 | priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs));
|
---|
3812 | }
|
---|
3813 |
|
---|
3814 | /*!
|
---|
3815 | Constructs a regular expression as a copy of \a rx.
|
---|
3816 |
|
---|
3817 | \sa operator=()
|
---|
3818 | */
|
---|
3819 | QRegExp::QRegExp(const QRegExp &rx)
|
---|
3820 | {
|
---|
3821 | priv = new QRegExpPrivate;
|
---|
3822 | operator=(rx);
|
---|
3823 | }
|
---|
3824 |
|
---|
3825 | /*!
|
---|
3826 | Destroys the regular expression and cleans up its internal data.
|
---|
3827 | */
|
---|
3828 | QRegExp::~QRegExp()
|
---|
3829 | {
|
---|
3830 | invalidateEngine(priv);
|
---|
3831 | delete priv;
|
---|
3832 | }
|
---|
3833 |
|
---|
3834 | /*!
|
---|
3835 | Copies the regular expression \a rx and returns a reference to the
|
---|
3836 | copy. The case sensitivity, wildcard, and minimal matching options
|
---|
3837 | are also copied.
|
---|
3838 | */
|
---|
3839 | QRegExp &QRegExp::operator=(const QRegExp &rx)
|
---|
3840 | {
|
---|
3841 | prepareEngine(rx.priv); // to allow sharing
|
---|
3842 | QRegExpEngine *otherEng = rx.priv->eng;
|
---|
3843 | if (otherEng)
|
---|
3844 | otherEng->ref.ref();
|
---|
3845 | invalidateEngine(priv);
|
---|
3846 | priv->eng = otherEng;
|
---|
3847 | priv->engineKey = rx.priv->engineKey;
|
---|
3848 | priv->minimal = rx.priv->minimal;
|
---|
3849 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
3850 | priv->t = rx.priv->t;
|
---|
3851 | priv->capturedCache = rx.priv->capturedCache;
|
---|
3852 | #endif
|
---|
3853 | if (priv->eng)
|
---|
3854 | priv->matchState.prepareForMatch(priv->eng);
|
---|
3855 | priv->matchState.captured = rx.priv->matchState.captured;
|
---|
3856 | return *this;
|
---|
3857 | }
|
---|
3858 |
|
---|
3859 | /*!
|
---|
3860 | Returns true if this regular expression is equal to \a rx;
|
---|
3861 | otherwise returns false.
|
---|
3862 |
|
---|
3863 | Two QRegExp objects are equal if they have the same pattern
|
---|
3864 | strings and the same settings for case sensitivity, wildcard and
|
---|
3865 | minimal matching.
|
---|
3866 | */
|
---|
3867 | bool QRegExp::operator==(const QRegExp &rx) const
|
---|
3868 | {
|
---|
3869 | return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;
|
---|
3870 | }
|
---|
3871 |
|
---|
3872 | /*!
|
---|
3873 | \fn bool QRegExp::operator!=(const QRegExp &rx) const
|
---|
3874 |
|
---|
3875 | Returns true if this regular expression is not equal to \a rx;
|
---|
3876 | otherwise returns false.
|
---|
3877 |
|
---|
3878 | \sa operator==()
|
---|
3879 | */
|
---|
3880 |
|
---|
3881 | /*!
|
---|
3882 | Returns true if the pattern string is empty; otherwise returns
|
---|
3883 | false.
|
---|
3884 |
|
---|
3885 | If you call exactMatch() with an empty pattern on an empty string
|
---|
3886 | it will return true; otherwise it returns false since it operates
|
---|
3887 | over the whole string. If you call indexIn() with an empty pattern
|
---|
3888 | on \e any string it will return the start offset (0 by default)
|
---|
3889 | because the empty pattern matches the 'emptiness' at the start of
|
---|
3890 | the string. In this case the length of the match returned by
|
---|
3891 | matchedLength() will be 0.
|
---|
3892 |
|
---|
3893 | See QString::isEmpty().
|
---|
3894 | */
|
---|
3895 |
|
---|
3896 | bool QRegExp::isEmpty() const
|
---|
3897 | {
|
---|
3898 | return priv->engineKey.pattern.isEmpty();
|
---|
3899 | }
|
---|
3900 |
|
---|
3901 | /*!
|
---|
3902 | Returns true if the regular expression is valid; otherwise returns
|
---|
3903 | false. An invalid regular expression never matches.
|
---|
3904 |
|
---|
3905 | The pattern \bold{[a-z} is an example of an invalid pattern, since
|
---|
3906 | it lacks a closing square bracket.
|
---|
3907 |
|
---|
3908 | Note that the validity of a regexp may also depend on the setting
|
---|
3909 | of the wildcard flag, for example \bold{*.html} is a valid
|
---|
3910 | wildcard regexp but an invalid full regexp.
|
---|
3911 |
|
---|
3912 | \sa errorString()
|
---|
3913 | */
|
---|
3914 | bool QRegExp::isValid() const
|
---|
3915 | {
|
---|
3916 | if (priv->engineKey.pattern.isEmpty()) {
|
---|
3917 | return true;
|
---|
3918 | } else {
|
---|
3919 | prepareEngine(priv);
|
---|
3920 | return priv->eng->isValid();
|
---|
3921 | }
|
---|
3922 | }
|
---|
3923 |
|
---|
3924 | /*!
|
---|
3925 | Returns the pattern string of the regular expression. The pattern
|
---|
3926 | has either regular expression syntax or wildcard syntax, depending
|
---|
3927 | on patternSyntax().
|
---|
3928 |
|
---|
3929 | \sa patternSyntax(), caseSensitivity()
|
---|
3930 | */
|
---|
3931 | QString QRegExp::pattern() const
|
---|
3932 | {
|
---|
3933 | return priv->engineKey.pattern;
|
---|
3934 | }
|
---|
3935 |
|
---|
3936 | /*!
|
---|
3937 | Sets the pattern string to \a pattern. The case sensitivity,
|
---|
3938 | wildcard, and minimal matching options are not changed.
|
---|
3939 |
|
---|
3940 | \sa setPatternSyntax(), setCaseSensitivity()
|
---|
3941 | */
|
---|
3942 | void QRegExp::setPattern(const QString &pattern)
|
---|
3943 | {
|
---|
3944 | if (priv->engineKey.pattern != pattern) {
|
---|
3945 | invalidateEngine(priv);
|
---|
3946 | priv->engineKey.pattern = pattern;
|
---|
3947 | }
|
---|
3948 | }
|
---|
3949 |
|
---|
3950 | /*!
|
---|
3951 | Returns Qt::CaseSensitive if the regexp is matched case
|
---|
3952 | sensitively; otherwise returns Qt::CaseInsensitive.
|
---|
3953 |
|
---|
3954 | \sa patternSyntax(), pattern(), isMinimal()
|
---|
3955 | */
|
---|
3956 | Qt::CaseSensitivity QRegExp::caseSensitivity() const
|
---|
3957 | {
|
---|
3958 | return priv->engineKey.cs;
|
---|
3959 | }
|
---|
3960 |
|
---|
3961 | /*!
|
---|
3962 | Sets case sensitive matching to \a cs.
|
---|
3963 |
|
---|
3964 | If \a cs is Qt::CaseSensitive, \bold{\\.txt$} matches
|
---|
3965 | \c{readme.txt} but not \c{README.TXT}.
|
---|
3966 |
|
---|
3967 | \sa setPatternSyntax(), setPattern(), setMinimal()
|
---|
3968 | */
|
---|
3969 | void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)
|
---|
3970 | {
|
---|
3971 | if ((bool)cs != (bool)priv->engineKey.cs) {
|
---|
3972 | invalidateEngine(priv);
|
---|
3973 | priv->engineKey.cs = cs;
|
---|
3974 | }
|
---|
3975 | }
|
---|
3976 |
|
---|
3977 | /*!
|
---|
3978 | Returns the syntax used by the regular expression. The default is
|
---|
3979 | QRegExp::RegExp.
|
---|
3980 |
|
---|
3981 | \sa pattern(), caseSensitivity()
|
---|
3982 | */
|
---|
3983 | QRegExp::PatternSyntax QRegExp::patternSyntax() const
|
---|
3984 | {
|
---|
3985 | return priv->engineKey.patternSyntax;
|
---|
3986 | }
|
---|
3987 |
|
---|
3988 | /*!
|
---|
3989 | Sets the syntax mode for the regular expression. The default is
|
---|
3990 | QRegExp::RegExp.
|
---|
3991 |
|
---|
3992 | Setting \a syntax to QRegExp::Wildcard enables simple shell-like
|
---|
3993 | \l{wildcard matching}. For example, \bold{r*.txt} matches the
|
---|
3994 | string \c{readme.txt} in wildcard mode, but does not match
|
---|
3995 | \c{readme}.
|
---|
3996 |
|
---|
3997 | Setting \a syntax to QRegExp::FixedString means that the pattern
|
---|
3998 | is interpreted as a plain string. Special characters (e.g.,
|
---|
3999 | backslash) don't need to be escaped then.
|
---|
4000 |
|
---|
4001 | \sa setPattern(), setCaseSensitivity(), escape()
|
---|
4002 | */
|
---|
4003 | void QRegExp::setPatternSyntax(PatternSyntax syntax)
|
---|
4004 | {
|
---|
4005 | if (syntax != priv->engineKey.patternSyntax) {
|
---|
4006 | invalidateEngine(priv);
|
---|
4007 | priv->engineKey.patternSyntax = syntax;
|
---|
4008 | }
|
---|
4009 | }
|
---|
4010 |
|
---|
4011 | /*!
|
---|
4012 | Returns true if minimal (non-greedy) matching is enabled;
|
---|
4013 | otherwise returns false.
|
---|
4014 |
|
---|
4015 | \sa caseSensitivity(), setMinimal()
|
---|
4016 | */
|
---|
4017 | bool QRegExp::isMinimal() const
|
---|
4018 | {
|
---|
4019 | return priv->minimal;
|
---|
4020 | }
|
---|
4021 |
|
---|
4022 | /*!
|
---|
4023 | Enables or disables minimal matching. If \a minimal is false,
|
---|
4024 | matching is greedy (maximal) which is the default.
|
---|
4025 |
|
---|
4026 | For example, suppose we have the input string "We must be
|
---|
4027 | <b>bold</b>, very <b>bold</b>!" and the pattern
|
---|
4028 | \bold{<b>.*</b>}. With the default greedy (maximal) matching,
|
---|
4029 | the match is "We must be \underline{<b>bold</b>, very
|
---|
4030 | <b>bold</b>}!". But with minimal (non-greedy) matching, the
|
---|
4031 | first match is: "We must be \underline{<b>bold</b>}, very
|
---|
4032 | <b>bold</b>!" and the second match is "We must be <b>bold</b>,
|
---|
4033 | very \underline{<b>bold</b>}!". In practice we might use the pattern
|
---|
4034 | \bold{<b>[^<]*\</b>} instead, although this will still fail for
|
---|
4035 | nested tags.
|
---|
4036 |
|
---|
4037 | \sa setCaseSensitivity()
|
---|
4038 | */
|
---|
4039 | void QRegExp::setMinimal(bool minimal)
|
---|
4040 | {
|
---|
4041 | priv->minimal = minimal;
|
---|
4042 | }
|
---|
4043 |
|
---|
4044 | // ### Qt 5: make non-const
|
---|
4045 | /*!
|
---|
4046 | Returns true if \a str is matched exactly by this regular
|
---|
4047 | expression; otherwise returns false. You can determine how much of
|
---|
4048 | the string was matched by calling matchedLength().
|
---|
4049 |
|
---|
4050 | For a given regexp string R, exactMatch("R") is the equivalent of
|
---|
4051 | indexIn("^R$") since exactMatch() effectively encloses the regexp
|
---|
4052 | in the start of string and end of string anchors, except that it
|
---|
4053 | sets matchedLength() differently.
|
---|
4054 |
|
---|
4055 | For example, if the regular expression is \bold{blue}, then
|
---|
4056 | exactMatch() returns true only for input \c blue. For inputs \c
|
---|
4057 | bluebell, \c blutak and \c lightblue, exactMatch() returns false
|
---|
4058 | and matchedLength() will return 4, 3 and 0 respectively.
|
---|
4059 |
|
---|
4060 | Although const, this function sets matchedLength(),
|
---|
4061 | capturedTexts(), and pos().
|
---|
4062 |
|
---|
4063 | \sa indexIn(), lastIndexIn()
|
---|
4064 | */
|
---|
4065 | bool QRegExp::exactMatch(const QString &str) const
|
---|
4066 | {
|
---|
4067 | prepareEngineForMatch(priv, str);
|
---|
4068 | priv->matchState.match(str.unicode(), str.length(), 0, priv->minimal, true, 0);
|
---|
4069 | if (priv->matchState.captured[1] == str.length()) {
|
---|
4070 | return true;
|
---|
4071 | } else {
|
---|
4072 | priv->matchState.captured[0] = 0;
|
---|
4073 | priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen;
|
---|
4074 | return false;
|
---|
4075 | }
|
---|
4076 | }
|
---|
4077 |
|
---|
4078 | // ### Qt 5: make non-const
|
---|
4079 | /*!
|
---|
4080 | Attempts to find a match in \a str from position \a offset (0 by
|
---|
4081 | default). If \a offset is -1, the search starts at the last
|
---|
4082 | character; if -2, at the next to last character; etc.
|
---|
4083 |
|
---|
4084 | Returns the position of the first match, or -1 if there was no
|
---|
4085 | match.
|
---|
4086 |
|
---|
4087 | The \a caretMode parameter can be used to instruct whether \bold{^}
|
---|
4088 | should match at index 0 or at \a offset.
|
---|
4089 |
|
---|
4090 | You might prefer to use QString::indexOf(), QString::contains(),
|
---|
4091 | or even QStringList::filter(). To replace matches use
|
---|
4092 | QString::replace().
|
---|
4093 |
|
---|
4094 | Example:
|
---|
4095 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 13
|
---|
4096 |
|
---|
4097 | Although const, this function sets matchedLength(),
|
---|
4098 | capturedTexts() and pos().
|
---|
4099 |
|
---|
4100 | If the QRegExp is a wildcard expression (see setPatternSyntax())
|
---|
4101 | and you want to test a string against the whole wildcard expression,
|
---|
4102 | use exactMatch() instead of this function.
|
---|
4103 |
|
---|
4104 | \sa lastIndexIn(), exactMatch()
|
---|
4105 | */
|
---|
4106 |
|
---|
4107 | int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const
|
---|
4108 | {
|
---|
4109 | prepareEngineForMatch(priv, str);
|
---|
4110 | if (offset < 0)
|
---|
4111 | offset += str.length();
|
---|
4112 | priv->matchState.match(str.unicode(), str.length(), offset,
|
---|
4113 | priv->minimal, false, caretIndex(offset, caretMode));
|
---|
4114 | return priv->matchState.captured[0];
|
---|
4115 | }
|
---|
4116 |
|
---|
4117 | // ### Qt 5: make non-const
|
---|
4118 | /*!
|
---|
4119 | Attempts to find a match backwards in \a str from position \a
|
---|
4120 | offset. If \a offset is -1 (the default), the search starts at the
|
---|
4121 | last character; if -2, at the next to last character; etc.
|
---|
4122 |
|
---|
4123 | Returns the position of the first match, or -1 if there was no
|
---|
4124 | match.
|
---|
4125 |
|
---|
4126 | The \a caretMode parameter can be used to instruct whether \bold{^}
|
---|
4127 | should match at index 0 or at \a offset.
|
---|
4128 |
|
---|
4129 | Although const, this function sets matchedLength(),
|
---|
4130 | capturedTexts() and pos().
|
---|
4131 |
|
---|
4132 | \warning Searching backwards is much slower than searching
|
---|
4133 | forwards.
|
---|
4134 |
|
---|
4135 | \sa indexIn(), exactMatch()
|
---|
4136 | */
|
---|
4137 |
|
---|
4138 | int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const
|
---|
4139 | {
|
---|
4140 | prepareEngineForMatch(priv, str);
|
---|
4141 | if (offset < 0)
|
---|
4142 | offset += str.length();
|
---|
4143 | if (offset < 0 || offset > str.length()) {
|
---|
4144 | memset(priv->matchState.captured, -1, priv->matchState.capturedSize*sizeof(int));
|
---|
4145 | return -1;
|
---|
4146 | }
|
---|
4147 |
|
---|
4148 | while (offset >= 0) {
|
---|
4149 | priv->matchState.match(str.unicode(), str.length(), offset,
|
---|
4150 | priv->minimal, true, caretIndex(offset, caretMode));
|
---|
4151 | if (priv->matchState.captured[0] == offset)
|
---|
4152 | return offset;
|
---|
4153 | --offset;
|
---|
4154 | }
|
---|
4155 | return -1;
|
---|
4156 | }
|
---|
4157 |
|
---|
4158 | /*!
|
---|
4159 | Returns the length of the last matched string, or -1 if there was
|
---|
4160 | no match.
|
---|
4161 |
|
---|
4162 | \sa exactMatch(), indexIn(), lastIndexIn()
|
---|
4163 | */
|
---|
4164 | int QRegExp::matchedLength() const
|
---|
4165 | {
|
---|
4166 | return priv->matchState.captured[1];
|
---|
4167 | }
|
---|
4168 |
|
---|
4169 | #ifndef QT_NO_REGEXP_CAPTURE
|
---|
4170 | /*!
|
---|
4171 | \obsolete
|
---|
4172 | Returns the number of captures contained in the regular expression.
|
---|
4173 |
|
---|
4174 | \sa captureCount()
|
---|
4175 | */
|
---|
4176 | int QRegExp::numCaptures() const
|
---|
4177 | {
|
---|
4178 | return captureCount();
|
---|
4179 | }
|
---|
4180 |
|
---|
4181 | /*!
|
---|
4182 | \since 4.6
|
---|
4183 | Returns the number of captures contained in the regular expression.
|
---|
4184 | */
|
---|
4185 | int QRegExp::captureCount() const
|
---|
4186 | {
|
---|
4187 | prepareEngine(priv);
|
---|
4188 | return priv->eng->captureCount();
|
---|
4189 | }
|
---|
4190 |
|
---|
4191 | /*!
|
---|
4192 | Returns a list of the captured text strings.
|
---|
4193 |
|
---|
4194 | The first string in the list is the entire matched string. Each
|
---|
4195 | subsequent list element contains a string that matched a
|
---|
4196 | (capturing) subexpression of the regexp.
|
---|
4197 |
|
---|
4198 | For example:
|
---|
4199 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 14
|
---|
4200 |
|
---|
4201 | The above example also captures elements that may be present but
|
---|
4202 | which we have no interest in. This problem can be solved by using
|
---|
4203 | non-capturing parentheses:
|
---|
4204 |
|
---|
4205 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 15
|
---|
4206 |
|
---|
4207 | Note that if you want to iterate over the list, you should iterate
|
---|
4208 | over a copy, e.g.
|
---|
4209 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 16
|
---|
4210 |
|
---|
4211 | Some regexps can match an indeterminate number of times. For
|
---|
4212 | example if the input string is "Offsets: 12 14 99 231 7" and the
|
---|
4213 | regexp, \c{rx}, is \bold{(\\d+)+}, we would hope to get a list of
|
---|
4214 | all the numbers matched. However, after calling
|
---|
4215 | \c{rx.indexIn(str)}, capturedTexts() will return the list ("12",
|
---|
4216 | "12"), i.e. the entire match was "12" and the first subexpression
|
---|
4217 | matched was "12". The correct approach is to use cap() in a
|
---|
4218 | \l{QRegExp#cap_in_a_loop}{loop}.
|
---|
4219 |
|
---|
4220 | The order of elements in the string list is as follows. The first
|
---|
4221 | element is the entire matching string. Each subsequent element
|
---|
4222 | corresponds to the next capturing open left parentheses. Thus
|
---|
4223 | capturedTexts()[1] is the text of the first capturing parentheses,
|
---|
4224 | capturedTexts()[2] is the text of the second and so on
|
---|
4225 | (corresponding to $1, $2, etc., in some other regexp languages).
|
---|
4226 |
|
---|
4227 | \sa cap(), pos()
|
---|
4228 | */
|
---|
4229 | QStringList QRegExp::capturedTexts() const
|
---|
4230 | {
|
---|
4231 | if (priv->capturedCache.isEmpty()) {
|
---|
4232 | prepareEngine(priv);
|
---|
4233 | const int *captured = priv->matchState.captured;
|
---|
4234 | int n = priv->matchState.capturedSize;
|
---|
4235 |
|
---|
4236 | for (int i = 0; i < n; i += 2) {
|
---|
4237 | QString m;
|
---|
4238 | if (captured[i + 1] == 0)
|
---|
4239 | m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty
|
---|
4240 | else if (captured[i] >= 0)
|
---|
4241 | m = priv->t.mid(captured[i], captured[i + 1]);
|
---|
4242 | priv->capturedCache.append(m);
|
---|
4243 | }
|
---|
4244 | priv->t.clear();
|
---|
4245 | }
|
---|
4246 | return priv->capturedCache;
|
---|
4247 | }
|
---|
4248 |
|
---|
4249 | /*!
|
---|
4250 | \internal
|
---|
4251 | */
|
---|
4252 | QStringList QRegExp::capturedTexts()
|
---|
4253 | {
|
---|
4254 | return const_cast<const QRegExp *>(this)->capturedTexts();
|
---|
4255 | }
|
---|
4256 |
|
---|
4257 | /*!
|
---|
4258 | Returns the text captured by the \a nth subexpression. The entire
|
---|
4259 | match has index 0 and the parenthesized subexpressions have
|
---|
4260 | indexes starting from 1 (excluding non-capturing parentheses).
|
---|
4261 |
|
---|
4262 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 17
|
---|
4263 |
|
---|
4264 | The order of elements matched by cap() is as follows. The first
|
---|
4265 | element, cap(0), is the entire matching string. Each subsequent
|
---|
4266 | element corresponds to the next capturing open left parentheses.
|
---|
4267 | Thus cap(1) is the text of the first capturing parentheses, cap(2)
|
---|
4268 | is the text of the second, and so on.
|
---|
4269 |
|
---|
4270 | \sa capturedTexts(), pos()
|
---|
4271 | */
|
---|
4272 | QString QRegExp::cap(int nth) const
|
---|
4273 | {
|
---|
4274 | return capturedTexts().value(nth);
|
---|
4275 | }
|
---|
4276 |
|
---|
4277 | /*!
|
---|
4278 | \internal
|
---|
4279 | */
|
---|
4280 | QString QRegExp::cap(int nth)
|
---|
4281 | {
|
---|
4282 | return const_cast<const QRegExp *>(this)->cap(nth);
|
---|
4283 | }
|
---|
4284 |
|
---|
4285 | /*!
|
---|
4286 | Returns the position of the \a nth captured text in the searched
|
---|
4287 | string. If \a nth is 0 (the default), pos() returns the position
|
---|
4288 | of the whole match.
|
---|
4289 |
|
---|
4290 | Example:
|
---|
4291 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 18
|
---|
4292 |
|
---|
4293 | For zero-length matches, pos() always returns -1. (For example, if
|
---|
4294 | cap(4) would return an empty string, pos(4) returns -1.) This is
|
---|
4295 | a feature of the implementation.
|
---|
4296 |
|
---|
4297 | \sa cap(), capturedTexts()
|
---|
4298 | */
|
---|
4299 | int QRegExp::pos(int nth) const
|
---|
4300 | {
|
---|
4301 | if (nth < 0 || nth >= priv->matchState.capturedSize / 2)
|
---|
4302 | return -1;
|
---|
4303 | else
|
---|
4304 | return priv->matchState.captured[2 * nth];
|
---|
4305 | }
|
---|
4306 |
|
---|
4307 | /*!
|
---|
4308 | \internal
|
---|
4309 | */
|
---|
4310 | int QRegExp::pos(int nth)
|
---|
4311 | {
|
---|
4312 | return const_cast<const QRegExp *>(this)->pos(nth);
|
---|
4313 | }
|
---|
4314 |
|
---|
4315 | /*!
|
---|
4316 | Returns a text string that explains why a regexp pattern is
|
---|
4317 | invalid, if that is the case; otherwise returns "no error occurred".
|
---|
4318 |
|
---|
4319 | \sa isValid()
|
---|
4320 | */
|
---|
4321 | QString QRegExp::errorString() const
|
---|
4322 | {
|
---|
4323 | if (isValid()) {
|
---|
4324 | return QString::fromLatin1(RXERR_OK);
|
---|
4325 | } else {
|
---|
4326 | return priv->eng->errorString();
|
---|
4327 | }
|
---|
4328 | }
|
---|
4329 |
|
---|
4330 | /*!
|
---|
4331 | \internal
|
---|
4332 | */
|
---|
4333 | QString QRegExp::errorString()
|
---|
4334 | {
|
---|
4335 | return const_cast<const QRegExp *>(this)->errorString();
|
---|
4336 | }
|
---|
4337 | #endif
|
---|
4338 |
|
---|
4339 | /*!
|
---|
4340 | Returns the string \a str with every regexp special character
|
---|
4341 | escaped with a backslash. The special characters are $, (, ), *, +,
|
---|
4342 | ., ?, [, \\, ], ^, {, | and }.
|
---|
4343 |
|
---|
4344 | Example:
|
---|
4345 |
|
---|
4346 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 19
|
---|
4347 |
|
---|
4348 | This function is useful to construct regexp patterns dynamically:
|
---|
4349 |
|
---|
4350 | \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 20
|
---|
4351 |
|
---|
4352 | \sa setPatternSyntax()
|
---|
4353 | */
|
---|
4354 | QString QRegExp::escape(const QString &str)
|
---|
4355 | {
|
---|
4356 | QString quoted;
|
---|
4357 | const int count = str.count();
|
---|
4358 | quoted.reserve(count * 2);
|
---|
4359 | const QLatin1Char backslash('\\');
|
---|
4360 | for (int i = 0; i < count; i++) {
|
---|
4361 | switch (str.at(i).toLatin1()) {
|
---|
4362 | case '$':
|
---|
4363 | case '(':
|
---|
4364 | case ')':
|
---|
4365 | case '*':
|
---|
4366 | case '+':
|
---|
4367 | case '.':
|
---|
4368 | case '?':
|
---|
4369 | case '[':
|
---|
4370 | case '\\':
|
---|
4371 | case ']':
|
---|
4372 | case '^':
|
---|
4373 | case '{':
|
---|
4374 | case '|':
|
---|
4375 | case '}':
|
---|
4376 | quoted.append(backslash);
|
---|
4377 | }
|
---|
4378 | quoted.append(str.at(i));
|
---|
4379 | }
|
---|
4380 | return quoted;
|
---|
4381 | }
|
---|
4382 |
|
---|
4383 | /*!
|
---|
4384 | \fn bool QRegExp::caseSensitive() const
|
---|
4385 |
|
---|
4386 | Use \l caseSensitivity() instead.
|
---|
4387 | */
|
---|
4388 |
|
---|
4389 | /*!
|
---|
4390 | \fn void QRegExp::setCaseSensitive(bool sensitive)
|
---|
4391 |
|
---|
4392 | Use \l setCaseSensitivity() instead.
|
---|
4393 | */
|
---|
4394 |
|
---|
4395 | /*!
|
---|
4396 | \fn bool QRegExp::wildcard() const
|
---|
4397 |
|
---|
4398 | Use \l patternSyntax() instead.
|
---|
4399 |
|
---|
4400 | \oldcode
|
---|
4401 | bool wc = rx.wildcard();
|
---|
4402 | \newcode
|
---|
4403 | bool wc = (rx.patternSyntax() == QRegExp::Wildcard);
|
---|
4404 | \endcode
|
---|
4405 | */
|
---|
4406 |
|
---|
4407 | /*!
|
---|
4408 | \fn void QRegExp::setWildcard(bool wildcard)
|
---|
4409 |
|
---|
4410 | Use \l setPatternSyntax() instead.
|
---|
4411 |
|
---|
4412 | \oldcode
|
---|
4413 | rx.setWildcard(wc);
|
---|
4414 | \newcode
|
---|
4415 | rx.setPatternSyntax(wc ? QRegExp::Wildcard : QRegExp::RegExp);
|
---|
4416 | \endcode
|
---|
4417 | */
|
---|
4418 |
|
---|
4419 | /*!
|
---|
4420 | \fn bool QRegExp::minimal() const
|
---|
4421 |
|
---|
4422 | Use \l isMinimal() instead.
|
---|
4423 | */
|
---|
4424 |
|
---|
4425 | /*!
|
---|
4426 | \fn int QRegExp::search(const QString &str, int from = 0,
|
---|
4427 | CaretMode caretMode = CaretAtZero) const
|
---|
4428 |
|
---|
4429 | Use \l indexIn() instead.
|
---|
4430 | */
|
---|
4431 |
|
---|
4432 | /*!
|
---|
4433 | \fn int QRegExp::searchRev(const QString &str, int from = -1, \
|
---|
4434 | CaretMode caretMode = CaretAtZero) const
|
---|
4435 |
|
---|
4436 | Use \l lastIndexIn() instead.
|
---|
4437 | */
|
---|
4438 |
|
---|
4439 | /*!
|
---|
4440 | \fn QRegExp::QRegExp(const QString &pattern, bool cs, bool wildcard = false)
|
---|
4441 |
|
---|
4442 | Use another constructor instead.
|
---|
4443 |
|
---|
4444 | \oldcode
|
---|
4445 | QRegExp rx("*.txt", false, true);
|
---|
4446 | \newcode
|
---|
4447 | QRegExp rx("*.txt", Qt::CaseInsensitive, QRegExp::Wildcard);
|
---|
4448 | \endcode
|
---|
4449 | */
|
---|
4450 |
|
---|
4451 | #ifndef QT_NO_DATASTREAM
|
---|
4452 | /*!
|
---|
4453 | \relates QRegExp
|
---|
4454 |
|
---|
4455 | Writes the regular expression \a regExp to stream \a out.
|
---|
4456 |
|
---|
4457 | \sa {Format of the QDataStream Operators}
|
---|
4458 | */
|
---|
4459 | QDataStream &operator<<(QDataStream &out, const QRegExp ®Exp)
|
---|
4460 | {
|
---|
4461 | return out << regExp.pattern() << (quint8)regExp.caseSensitivity()
|
---|
4462 | << (quint8)regExp.patternSyntax()
|
---|
4463 | << (quint8)!!regExp.isMinimal();
|
---|
4464 | }
|
---|
4465 |
|
---|
4466 | /*!
|
---|
4467 | \relates QRegExp
|
---|
4468 |
|
---|
4469 | Reads a regular expression from stream \a in into \a regExp.
|
---|
4470 |
|
---|
4471 | \sa {Format of the QDataStream Operators}
|
---|
4472 | */
|
---|
4473 | QDataStream &operator>>(QDataStream &in, QRegExp ®Exp)
|
---|
4474 | {
|
---|
4475 | QString pattern;
|
---|
4476 | quint8 cs;
|
---|
4477 | quint8 patternSyntax;
|
---|
4478 | quint8 isMinimal;
|
---|
4479 |
|
---|
4480 | in >> pattern >> cs >> patternSyntax >> isMinimal;
|
---|
4481 |
|
---|
4482 | QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),
|
---|
4483 | QRegExp::PatternSyntax(patternSyntax));
|
---|
4484 |
|
---|
4485 | newRegExp.setMinimal(isMinimal);
|
---|
4486 | regExp = newRegExp;
|
---|
4487 | return in;
|
---|
4488 | }
|
---|
4489 | #endif // QT_NO_DATASTREAM
|
---|
4490 |
|
---|
4491 | QT_END_NAMESPACE
|
---|