Context Navigation

source: trunk/src/corelib/tools/qregexp.cpp@ 570

Last change on this file since 570 was 561, checked in by Dmitry A. Kuminov, 15 years ago
trunk: Merged in qt 4.6.1 sources.
File size: 150.6 KB

Line
1	/****************************************************************************
2	**
3	** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
4	** All rights reserved.
5	** Contact: Nokia Corporation (qt-info@nokia.com)
6	**
7	** This file is part of the QtCore module of the Qt Toolkit.
8	**
9	** $QT_BEGIN_LICENSE:LGPL$
10	** Commercial Usage
11	** Licensees holding valid Qt Commercial licenses may use this file in
12	** accordance with the Qt Commercial License Agreement provided with the
13	** Software or, alternatively, in accordance with the terms contained in
14	** a written agreement between you and Nokia.
15	**
16	** GNU Lesser General Public License Usage
17	** Alternatively, this file may be used under the terms of the GNU Lesser
18	** General Public License version 2.1 as published by the Free Software
19	** Foundation and appearing in the file LICENSE.LGPL included in the
20	** packaging of this file. Please review the following information to
21	** ensure the GNU Lesser General Public License version 2.1 requirements
22	** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
23	**
24	** In addition, as a special exception, Nokia gives you certain additional
25	** rights. These rights are described in the Nokia Qt LGPL Exception
26	** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
27	**
28	** GNU General Public License Usage
29	** Alternatively, this file may be used under the terms of the GNU
30	** General Public License version 3.0 as published by the Free Software
31	** Foundation and appearing in the file LICENSE.GPL included in the
32	** packaging of this file. Please review the following information to
33	** ensure the GNU General Public License version 3.0 requirements will be
34	** met: http://www.gnu.org/copyleft/gpl.html.
35	**
36	** If you have questions regarding the use of this file, please contact
37	** Nokia at qt-info@nokia.com.
38	** $QT_END_LICENSE$
39	**
40	****************************************************************************/
41
42	#include "qregexp.h"
43
44	#include "qalgorithms.h"
45	#include "qbitarray.h"
46	#include "qcache.h"
47	#include "qdatastream.h"
48	#include "qlist.h"
49	#include "qmap.h"
50	#include "qmutex.h"
51	#include "qstring.h"
52	#include "qstringlist.h"
53	#include "qstringmatcher.h"
54	#include "qvector.h"
55	#include "private/qfunctions_p.h"
56
57	#include <limits.h>
58
59	QT_BEGIN_NAMESPACE
60
61	int qFindString(const QChar *haystack, int haystackLen, int from,
62	const QChar *needle, int needleLen, Qt::CaseSensitivity cs);
63
// Error strings for the regexp parser. Each message is wrapped in
// QT_TRANSLATE_NOOP with the "QRegExp" context.
#define RXERR_OK         QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
#define RXERR_DISABLED   QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
#define RXERR_CHARCLASS  QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
#define RXERR_LOOKAHEAD  QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
#define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
#define RXERR_OCTAL      QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
#define RXERR_LEFTDELIM  QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
#define RXERR_END        QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
#define RXERR_LIMIT      QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
#define RXERR_INTERVAL   QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
#define RXERR_CATEGORY   QT_TRANSLATE_NOOP("QRegExp", "invalid category")
76
77	/*
78	WARNING! Be sure to read qregexp.tex before modifying this file.
79	*/
80
81	/*!
82	\class QRegExp
83	\reentrant
84	\brief The QRegExp class provides pattern matching using regular expressions.
85
86	\ingroup tools
87	\ingroup shared
88
89	\keyword regular expression
90
91	A regular expression, or "regexp", is a pattern for matching
92	substrings in a text. This is useful in many contexts, e.g.,
93
94	\table
95	\row \i Validation
96	\i A regexp can test whether a substring meets some criteria,
97	e.g. is an integer or contains no whitespace.
98	\row \i Searching
99	\i A regexp provides more powerful pattern matching than
100	simple substring matching, e.g., match one of the words
101	\e{mail}, \e{letter} or \e{correspondence}, but none of the
102	words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
103	\row \i Search and Replace
104	\i A regexp can replace all occurrences of a substring with a
105	different substring, e.g., replace all occurrences of \e{&}
106	with \e{\&amp;} except where the \e{&} is already followed by
107	an \e{amp;}.
108	\row \i String Splitting
109	\i A regexp can be used to identify where a string should be
110	split apart, e.g. splitting tab-delimited strings.
111	\endtable
112
113	A brief introduction to regexps is presented, a description of
114	Qt's regexp language, some examples, and the function
115	documentation itself. QRegExp is modeled on Perl's regexp
116	language. It fully supports Unicode. QRegExp can also be used in a
117	simpler, \e{wildcard mode} that is similar to the functionality
118	found in command shells. The syntax rules used by QRegExp can be
119	changed with setPatternSyntax(). In particular, the pattern syntax
120	can be set to QRegExp::FixedString, which means the pattern to be
121	matched is interpreted as a plain string, i.e., special characters
122	(e.g., backslash) are not escaped.
123
124	A good text on regexps is \e {Mastering Regular Expressions}
125	(Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4.
126
127	\tableofcontents
128
129	\section1 Introduction
130
131	Regexps are built up from expressions, quantifiers, and
132	assertions. The simplest expression is a character, e.g. \bold{x}
133	or \bold{5}. An expression can also be a set of characters
134	enclosed in square brackets. \bold{[ABCD]} will match an \bold{A}
135	or a \bold{B} or a \bold{C} or a \bold{D}. We can write this same
136	expression as \bold{[A-D]}, and an expression to match any
137	capital letter in the English alphabet is written as
138	\bold{[A-Z]}.
139
140	A quantifier specifies the number of occurrences of an expression
141	that must be matched. \bold{x{1,1}} means match one and only one
142	\bold{x}. \bold{x{1,5}} means match a sequence of \bold{x}
143	characters that contains at least one \bold{x} but no more than
144	five.
145
146	Note that in general regexps cannot be used to check for balanced
147	brackets or tags. For example, a regexp can be written to match an
148	opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
149	are not nested, but if the \c{<b>} tags are nested, that same
150	regexp will match an opening \c{<b>} tag with the wrong closing
151	\c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the
152	first \c{<b>} would be matched with the first \c{</b>}, which is
153	not correct. However, it is possible to write a regexp that will
154	match nested brackets or tags correctly, but only if the number of
155	nesting levels is fixed and known. If the number of nesting levels
156	is not fixed and known, it is impossible to write a regexp that
157	will not fail.
158
159	Suppose we want a regexp to match integers in the range 0 to 99.
160	At least one digit is required, so we start with the expression
161	\bold{[0-9]{1,1}}, which matches a single digit exactly once. This
162	regexp matches integers in the range 0 to 9. To match integers up
163	to 99, increase the maximum number of occurrences to 2, so the
164	regexp becomes \bold{[0-9]{1,2}}. This regexp satisfies the
165	original requirement to match integers from 0 to 99, but it will
166	also match integers that occur in the middle of strings. If we
167	want the matched integer to be the whole string, we must use the
168	anchor assertions, \bold{^} (caret) and \bold{$} (dollar). When
169	\bold{^} is the first character in a regexp, it means the regexp
170	must match from the beginning of the string. When \bold{$} is the
171	last character of the regexp, it means the regexp must match to
172	the end of the string. The regexp becomes \bold{^[0-9]{1,2}$}.
173	Note that assertions, e.g. \bold{^} and \bold{$}, do not match
174	characters but locations in the string.
175
176	If you have seen regexps described elsewhere, they may have looked
177	different from the ones shown here. This is because some sets of
178	characters and some quantifiers are so common that they have been
179	given special symbols to represent them. \bold{[0-9]} can be
180	replaced with the symbol \bold{\\d}. The quantifier to match
181	exactly one occurrence, \bold{{1,1}}, can be replaced with the
182	expression itself, i.e. \bold{x{1,1}} is the same as \bold{x}. So
183	our 0 to 99 matcher could be written as \bold{^\\d{1,2}$}. It can
184	also be written \bold{^\\d\\d{0,1}$}, i.e. \e{From the start of
185	the string, match a digit, followed immediately by 0 or 1 digits}.
186	In practice, it would be written as \bold{^\\d\\d?$}. The \bold{?}
187	is shorthand for the quantifier \bold{{0,1}}, i.e. 0 or 1
188	occurrences. \bold{?} makes an expression optional. The regexp
189	\bold{^\\d\\d?$} means \e{From the beginning of the string, match
190	one digit, followed immediately by 0 or 1 more digit, followed
191	immediately by end of string}.
192
193	To write a regexp that matches one of the words 'mail' \e or
194	'letter' \e or 'correspondence' but does not match words that
195	contain these words, e.g., 'email', 'mailman', 'mailer', and
196	'letterbox', start with a regexp that matches 'mail'. Expressed
197	fully, the regexp is \bold{m{1,1}a{1,1}i{1,1}l{1,1}}, but because
198	a character expression is automatically quantified by
199	\bold{{1,1}}, we can simplify the regexp to \bold{mail}, i.e., an
200	'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now
201	we can use the vertical bar \bold{\|}, which means \bold{or}, to
202	include the other two words, so our regexp for matching any of the
203	three words becomes \bold{mail\|letter\|correspondence}. Match
204	'mail' \bold{or} 'letter' \bold{or} 'correspondence'. While this
205	regexp will match one of the three words we want to match, it will
206	also match words we don't want to match, e.g., 'email'. To
207	prevent the regexp from matching unwanted words, we must tell it
208	to begin and end the match at word boundaries. First we enclose
209	our regexp in parentheses, \bold{(mail\|letter\|correspondence)}.
210	Parentheses group expressions together, and they identify a part
211	of the regexp that we wish to \l{capturing text}{capture}.
212	Enclosing the expression in parentheses allows us to use it as a
213	component in more complex regexps. It also allows us to examine
214	which of the three words was actually matched. To force the match
215	to begin and end on word boundaries, we enclose the regexp in
216	\bold{\\b} \e{word boundary} assertions:
217	\bold{\\b(mail\|letter\|correspondence)\\b}. Now the regexp means:
218	\e{Match a word boundary, followed by the regexp in parentheses,
219	followed by a word boundary}. The \bold{\\b} assertion matches a
220	\e position in the regexp, not a \e character. A word boundary is
221	any non-word character, e.g., a space, newline, or the beginning
222	or ending of a string.
223
224	If we want to replace ampersand characters with the HTML entity
225	\bold{\&amp;}, the regexp to match is simply \bold{\&}. But this
226	regexp will also match ampersands that have already been converted
227	to HTML entities. We want to replace only ampersands that are not
228	already followed by \bold{amp;}. For this, we need the negative
229	lookahead assertion, \bold{(?!}__\bold{)}. The regexp can then be
230	written as \bold{\&(?!amp;)}, i.e. \e{Match an ampersand that is}
231	\bold{not} \e{followed by} \bold{amp;}.
232
233	If we want to count all the occurrences of 'Eric' and 'Eirik' in a
234	string, two valid solutions are \bold{\\b(Eric\|Eirik)\\b} and
235	\bold{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is
236	required to avoid matching words that contain either name,
237	e.g. 'Ericsson'. Note that the second regexp matches more
238	spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.
239
240	Some of the examples discussed above are implemented in the
241	\link #code-examples code examples \endlink section.
242
243	\target characters-and-abbreviations-for-sets-of-characters
244	\section1 Characters and Abbreviations for Sets of Characters
245
246	\table
247	\header \i Element \i Meaning
248	\row \i \bold{c}
249	\i A character represents itself unless it has a special
250	regexp meaning. e.g. \bold{c} matches the character \e c.
251	\row \i \bold{\\c}
252	\i A character that follows a backslash matches the character
253	itself, except as specified below. e.g., To match a literal
254	caret at the beginning of a string, write \bold{\\^}.
255	\row \i \bold{\\a}
256	\i Matches the ASCII bell (BEL, 0x07).
257	\row \i \bold{\\f}
258	\i Matches the ASCII form feed (FF, 0x0C).
259	\row \i \bold{\\n}
260	\i Matches the ASCII line feed (LF, 0x0A, Unix newline).
261	\row \i \bold{\\r}
262	\i Matches the ASCII carriage return (CR, 0x0D).
263	\row \i \bold{\\t}
264	\i Matches the ASCII horizontal tab (HT, 0x09).
265	\row \i \bold{\\v}
266	\i Matches the ASCII vertical tab (VT, 0x0B).
267	\row \i \bold{\\x\e{hhhh}}
268	\i Matches the Unicode character corresponding to the
269	hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).
270	\row \i \bold{\\0\e{ooo}} (i.e., \\zero \e{ooo})
271	\i matches the ASCII/Latin1 character for the octal number
272	\e{ooo} (between 0 and 0377).
273	\row \i \bold{. (dot)}
274	\i Matches any character (including newline).
275	\row \i \bold{\\d}
276	\i Matches a digit (QChar::isDigit()).
277	\row \i \bold{\\D}
278	\i Matches a non-digit.
279	\row \i \bold{\\s}
280	\i Matches a whitespace character (QChar::isSpace()).
281	\row \i \bold{\\S}
282	\i Matches a non-whitespace character.
283	\row \i \bold{\\w}
284	\i Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').
285	\row \i \bold{\\W}
286	\i Matches a non-word character.
287	\row \i \bold{\\\e{n}}
288	\i The \e{n}-th \l backreference, e.g. \\1, \\2, etc.
289	\endtable
290
291	\bold{Note:} The C++ compiler transforms backslashes in strings.
292	To include a \bold{\\} in a regexp, enter it twice, i.e. \c{\\}.
293	To match the backslash character itself, enter it four times, i.e.
294	\c{\\\\}.
295
296	\target sets-of-characters
297	\section1 Sets of Characters
298
299	Square brackets mean match any character contained in the square
300	brackets. The character set abbreviations described above can
301	appear in a character set in square brackets. Except for the
302	character set abbreviations and the following two exceptions,
303	characters do not have special meanings in square brackets.
304
305	\table
306	\row \i \bold{^}
307
308	\i The caret negates the character set if it occurs as the
309	first character (i.e. immediately after the opening square
310	bracket). \bold{[abc]} matches 'a' or 'b' or 'c', but
311	\bold{[^abc]} matches anything \e but 'a' or 'b' or 'c'.
312
313	\row \i \bold{-}
314
315	\i The dash indicates a range of characters. \bold{[W-Z]}
316	matches 'W' or 'X' or 'Y' or 'Z'.
317
318	\endtable
319
320	Using the predefined character set abbreviations is more portable
321	than using character ranges across platforms and languages. For
322	example, \bold{[0-9]} matches a digit in Western alphabets but
323	\bold{\\d} matches a digit in \e any alphabet.
324
325	Note: In other regexp documentation, sets of characters are often
326	called "character classes".
327
328	\target quantifiers
329	\section1 Quantifiers
330
331	By default, an expression is automatically quantified by
332	\bold{{1,1}}, i.e. it should occur exactly once. In the following
333	list, \bold{\e {E}} stands for expression. An expression is a
334	character, or an abbreviation for a set of characters, or a set of
335	characters in square brackets, or an expression in parentheses.
336
337	\table
338	\row \i \bold{\e {E}?}
339
340	\i Matches zero or one occurrences of \e E. This quantifier
341	means \e{The previous expression is optional}, because it
342	will match whether or not the expression is found. \bold{\e
343	{E}?} is the same as \bold{\e {E}{0,1}}. e.g., \bold{dents?}
344	matches 'dent' or 'dents'.
345
346	\row \i \bold{\e {E}+}
347
348	\i Matches one or more occurrences of \e E. \bold{\e {E}+} is
349	the same as \bold{\e {E}{1,}}. e.g., \bold{0+} matches '0',
350	'00', '000', etc.
351
352	\row \i \bold{\e {E}*}
353
354	\i Matches zero or more occurrences of \e E. It is the same
355	as \bold{\e {E}{0,}}. The \bold{*} quantifier is often used
356	in error where \bold{+} should be used. For example, if
357	\bold{\\s*$} is used in an expression to match strings that
358	end in whitespace, it will match every string because
359	\bold{\\s*$} means \e{Match zero or more whitespaces followed
360	by end of string}. The correct regexp to match strings that
361	have at least one trailing whitespace character is
362	\bold{\\s+$}.
363
364	\row \i \bold{\e {E}{n}}
365
366	\i Matches exactly \e n occurrences of \e E. \bold{\e {E}{n}}
367	is the same as repeating \e E \e n times. For example,
368	\bold{x{5}} is the same as \bold{xxxxx}. It is also the same
369	as \bold{\e {E}{n,n}}, e.g. \bold{x{5,5}}.
370
371	\row \i \bold{\e {E}{n,}}
372	\i Matches at least \e n occurrences of \e E.
373
374	\row \i \bold{\e {E}{,m}}
375	\i Matches at most \e m occurrences of \e E. \bold{\e {E}{,m}}
376	is the same as \bold{\e {E}{0,m}}.
377
378	\row \i \bold{\e {E}{n,m}}
379	\i Matches at least \e n and at most \e m occurrences of \e E.
380	\endtable
381
382	To apply a quantifier to more than just the preceding character,
383	use parentheses to group characters together in an expression. For
384	example, \bold{tag+} matches a 't' followed by an 'a' followed by
385	at least one 'g', whereas \bold{(tag)+} matches at least one
386	occurrence of 'tag'.
387
388	Note: Quantifiers are normally "greedy". They always match as much
389	text as they can. For example, \bold{0+} matches the first zero it
390	finds and all the consecutive zeros after the first zero. Applied
391	to '20005', it matches '2\underline{000}5'. Quantifiers can be made
392	non-greedy, see setMinimal().
393
394	\target capturing parentheses
395	\target backreferences
396	\section1 Capturing Text
397
398	Parentheses allow us to group elements together so that we can
399	quantify and capture them. For example if we have the expression
400	\bold{mail\|letter\|correspondence} that matches a string we know
401	that \e one of the words matched but not which one. Using
402	parentheses allows us to "capture" whatever is matched within
403	their bounds, so if we used \bold{(mail\|letter\|correspondence)}
404	and matched this regexp against the string "I sent you some email"
405	we can use the cap() or capturedTexts() functions to extract the
406	matched characters, in this case 'mail'.
407
408	We can use captured text within the regexp itself. To refer to the
409	captured text we use \e backreferences which are indexed from 1,
410	the same as for cap(). For example we could search for duplicate
411	words in a string using \bold{\\b(\\w+)\\W+\\1\\b} which means match a
412	word boundary followed by one or more word characters followed by
413	one or more non-word characters followed by the same text as the
414	first parenthesized expression followed by a word boundary.
415
416	If we want to use parentheses purely for grouping and not for
417	capturing we can use the non-capturing syntax, e.g.
418	\bold{(?:green\|blue)}. Non-capturing parentheses begin '(?:' and
419	end ')'. In this example we match either 'green' or 'blue' but we
420	do not capture the match so we only know whether or not we matched
421	but not which color we actually found. Using non-capturing
422	parentheses is more efficient than using capturing parentheses
423	since the regexp engine has to do less book-keeping.
424
425	Both capturing and non-capturing parentheses may be nested.
426
427	\target greedy quantifiers
428
429	For historical reasons, quantifiers (e.g. \bold{*}) that apply to
430	capturing parentheses are more "greedy" than other quantifiers.
431	For example, \bold{a*(a*)} will match "aaa" with cap(1) == "aaa".
432	This behavior is different from what other regexp engines do
433	(notably, Perl). To obtain a more intuitive capturing behavior,
434	specify QRegExp::RegExp2 to the QRegExp constructor or call
435	setPatternSyntax(QRegExp::RegExp2).
436
437	\target cap_in_a_loop
438
439	When the number of matches cannot be determined in advance, a
440	common idiom is to use cap() in a loop. For example:
441
442	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 0
443
444	\target assertions
445	\section1 Assertions
446
447	Assertions make some statement about the text at the point where
448	they occur in the regexp but they do not match any characters. In
449	the following list \bold{\e {E}} stands for any expression.
450
451	\table
452	\row \i \bold{^}
453	\i The caret signifies the beginning of the string. If you
454	wish to match a literal \c{^} you must escape it by
455	writing \c{\\^}. For example, \bold{^#include} will only
456	match strings which \e begin with the characters '#include'.
457	(When the caret is the first character of a character set it
458	has a special meaning, see \link #sets-of-characters Sets of
459	Characters \endlink.)
460
461	\row \i \bold{$}
462	\i The dollar signifies the end of the string. For example
463	\bold{\\d\\s*$} will match strings which end with a digit
464	optionally followed by whitespace. If you wish to match a
465	literal \c{$} you must escape it by writing
466	\c{\\$}.
467
468	\row \i \bold{\\b}
469	\i A word boundary. For example the regexp
470	\bold{\\bOK\\b} means match immediately after a word
471	boundary (e.g. start of string or whitespace) the letter 'O'
472	then the letter 'K' immediately before another word boundary
473	(e.g. end of string or whitespace). But note that the
474	assertion does not actually match any whitespace so if we
475	write \bold{(\\bOK\\b)} and we have a match it will only
476	contain 'OK' even if the string is "It's \underline{OK} now".
477
478	\row \i \bold{\\B}
479	\i A non-word boundary. This assertion is true wherever
480	\bold{\\b} is false. For example if we searched for
481	\bold{\\Bon\\B} in "Left on" the match would fail (space
482	and end of string aren't non-word boundaries), but it would
483	match in "t\underline{on}ne".
484
485	\row \i \bold{(?=\e E)}
486	\i Positive lookahead. This assertion is true if the
487	expression matches at this point in the regexp. For example,
488	\bold{const(?=\\s+char)} matches 'const' whenever it is
489	followed by 'char', as in 'static \underline{const} char *'.
490	(Compare with \bold{const\\s+char}, which matches 'static
491	\underline{const char} *'.)
492
493	\row \i \bold{(?!\e E)}
494	\i Negative lookahead. This assertion is true if the
495	expression does not match at this point in the regexp. For
496	example, \bold{const(?!\\s+char)} matches 'const' \e except
497	when it is followed by 'char'.
498	\endtable
499
500	\keyword QRegExp wildcard matching
501	\section1 Wildcard Matching
502
503	Most command shells such as \e bash or \e cmd.exe support "file
504	globbing", the ability to identify a group of files by using
505	wildcards. The setPatternSyntax() function is used to switch
506	between regexp and wildcard mode. Wildcard matching is much
507	simpler than full regexps and has only four features:
508
509	\table
510	\row \i \bold{c}
511	\i Any character represents itself apart from those mentioned
512	below. Thus \bold{c} matches the character \e c.
513	\row \i \bold{?}
514	\i Matches any single character. It is the same as
515	\bold{.} in full regexps.
516	\row \i \bold{*}
517	\i Matches zero or more of any characters. It is the
518	same as \bold{.*} in full regexps.
519	\row \i \bold{[...]}
520	\i Sets of characters can be represented in square brackets,
521	similar to full regexps. Within the character class, like
522	outside, backslash has no special meaning.
523	\endtable
524
525	In the mode Wildcard, the wildcard characters cannot be
526	escaped. In the mode WildcardUnix, the character '\' escapes the
527	wildcard.
528
529	For example if we are in wildcard mode and have strings which
530	contain filenames we could identify HTML files with \bold{*.html}.
531	This will match zero or more characters followed by a dot followed
532	by 'h', 't', 'm' and 'l'.
533
534	To test a string against a wildcard expression, use exactMatch().
535	For example:
536
537	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 1
538
539	\target perl-users
540	\section1 Notes for Perl Users
541
542	Most of the character class abbreviations supported by Perl are
543	supported by QRegExp, see \link
544	#characters-and-abbreviations-for-sets-of-characters characters
545	and abbreviations for sets of characters \endlink.
546
547	In QRegExp, apart from within character classes, \c{^} always
548	signifies the start of the string, so carets must always be
549	escaped unless used for that purpose. In Perl the meaning of caret
550	varies automagically depending on where it occurs so escaping it
551	is rarely necessary. The same applies to \c{$} which in
552	QRegExp always signifies the end of the string.
553
554	QRegExp's quantifiers are the same as Perl's greedy quantifiers
555	(but see the \l{greedy quantifiers}{note above}). Non-greedy
556	matching cannot be applied to individual quantifiers, but can be
557	applied to all the quantifiers in the pattern. For example, to
558	match the Perl regexp \bold{ro+?m} requires:
559
560	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 2
561
562	The equivalent of Perl's \c{/i} option is
563	setCaseSensitivity(Qt::CaseInsensitive).
564
565	Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.
566
567	In QRegExp \bold{.} matches any character, therefore all QRegExp
568	regexps have the equivalent of Perl's \c{/s} option. QRegExp
569	does not have an equivalent to Perl's \c{/m} option, but this
570	can be emulated in various ways for example by splitting the input
571	into lines or by looping with a regexp that searches for newlines.
572
573	Because QRegExp is string oriented, there are no \\A, \\Z, or \\z
574	assertions. The \\G assertion is not supported but can be emulated
575	in a loop.
576
577	Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
578	equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
579	... correspond to cap(1) or capturedTexts()[1], cap(2) or
580	capturedTexts()[2], etc.
581
582	To substitute a pattern use QString::replace().
583
584	Perl's extended \c{/x} syntax is not supported, nor are
585	directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
586	the other hand, C++'s rules for literal strings can be used to
587	achieve the same:
588
589	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 3
590
591	Both zero-width positive and zero-width negative lookahead
592	assertions (?=pattern) and (?!pattern) are supported with the same
593	syntax as Perl. Perl's lookbehind assertions, "independent"
594	subexpressions and conditional expressions are not supported.
595
596	Non-capturing parentheses are also supported, with the same
597	(?:pattern) syntax.
598
599	See QString::split() and QStringList::join() for equivalents
600	to Perl's split and join functions.
601
602	Note: because C++ transforms \\'s they must be written \e twice in
603	code, e.g. \bold{\\b} must be written \bold{\\\\b}.
604
605	\target code-examples
606	\section1 Code Examples
607
608	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 4
609
610	The third string matches '\underline{6}'. This is a simple validation
611	regexp for integers in the range 0 to 99.
612
613	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 5
614
615	The second string matches '\underline{This_is-OK}'. We've used the
616	character set abbreviation '\\S' (non-whitespace) and the anchors
617	to match strings which contain no whitespace.
618
619	In the following example we match strings containing 'mail' or
620	'letter' or 'correspondence' but only match whole words i.e. not
621	'email'
622
623	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 6
624
625	The second string matches "Please write the \underline{letter}". The
626	word 'letter' is also captured (because of the parentheses). We
627	can see what text we've captured like this:
628
629	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 7
630
631	This will capture the text from the first set of capturing
632	parentheses (counting capturing left parentheses from left to
633	right). The parentheses are counted from 1 since cap(0) is the
634	whole matched regexp (equivalent to '&' in most regexp engines).
635
636	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 8
637
638	Here we've passed the QRegExp to QString's replace() function to
639	replace the matched text with new text.
640
641	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 9
642
643	We've used the indexIn() function to repeatedly match the regexp in
644	the string. Note that instead of moving forward by one character
645	at a time \c pos++ we could have written \c {pos +=
646	rx.matchedLength()} to skip over the already matched string. The
647	count will equal 3, matching 'One \underline{Eric} another
648	\underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it
649	doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
650	by non-word boundaries.
651
652	One common use of regexps is to split lines of delimited data into
653	their component fields.
654
655	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 10
656
657	In this example our input lines have the format company name, web
658	address and country. Unfortunately the regexp is rather long and
659	not very versatile -- the code will break if we add any more
660	fields. A simpler and better solution is to look for the
661	separator, '\\t' in this case, and take the surrounding text. The
662	QString::split() function can take a separator string or regexp
663	as an argument and split a string accordingly.
664
665	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 11
666
667	Here field[0] is the company, field[1] the web address and so on.
668
669	To imitate the matching of a shell we can use wildcard mode.
670
671	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 12
672
673	Wildcard matching can be convenient because of its simplicity, but
674	any wildcard regexp can be defined using full regexps, e.g.
675	\bold{.*\\.html$}. Notice that we can't match both \c .html and \c
676	.htm files with a wildcard unless we use \bold{*.htm*} which will
677	also match 'test.html.bak'. A full regexp gives us the precision
678	we need, \bold{.*\\.html?$}.
679
680	QRegExp can match case insensitively using setCaseSensitivity(),
681	and can use non-greedy matching, see setMinimal(). By
682	default QRegExp uses full regexps but this can be changed with
683	setWildcard(). Searching can be forward with indexIn() or backward
684	with lastIndexIn(). Captured text can be accessed using
685	capturedTexts() which returns a string list of all captured
686	strings, or using cap() which returns the captured string for the
687	given index. The pos() function takes a match index and returns
688	the position in the string where the match was made (or -1 if
689	there was no match).
690
691	\sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel,
692	{tools/regexp}{Regular Expression Example}
693	*/
694
/*
    If an EOS macro is already defined when building for VxWorks, drop
    it so that the EOS constant declared below is not rewritten by the
    preprocessor.
*/
#if defined(Q_OS_VXWORKS) && defined(EOS)
#  undef EOS
#endif

/*
    BadChar() folds a character's unicode() value into the range
    [0, NumBadChars) by taking it modulo NumBadChars.
*/
const int NumBadChars = 64;
#define BadChar(ch) ((ch).unicode() % NumBadChars)

/*
    Sentinel values. Their users lie outside this part of the file, so
    the meanings below are inferred from the names (TODO confirm):
    NoOccurrence, EmptyCapture and InftyLen all share INT_MAX; InftyRep
    is presumably the repetition count treated as unbounded; EOS is
    presumably an end-of-string marker.
*/
const int NoOccurrence = INT_MAX;
const int EmptyCapture = INT_MAX;
const int InftyLen = INT_MAX;
const int InftyRep = 1025;
const int EOS = -1;
707
708	static bool isWord(QChar ch)
709	{
710	return ch.isLetterOrNumber() \|\| ch.isMark() \|\| ch == QLatin1Char('_');
711	}
712
713	/*
714	Merges two vectors of ints and puts the result into the first
715	one.
716	*/
717	static void mergeInto(QVector<int> *a, const QVector<int> &b)
718	{
719	int asize = a->size();
720	int bsize = b.size();
721	if (asize == 0) {
722	*a = b;
723	#ifndef QT_NO_REGEXP_OPTIM
724	} else if (bsize == 1 && a->at(asize - 1) < b.at(0)) {
725	a->resize(asize + 1);
726	(*a)[asize] = b.at(0);
727	#endif
728	} else if (bsize >= 1) {
729	int csize = asize + bsize;
730	QVector<int> c(csize);
731	int i = 0, j = 0, k = 0;
732	while (i < asize) {
733	if (j < bsize) {
734	if (a->at(i) == b.at(j)) {
735	++i;
736	--csize;
737	} else if (a->at(i) < b.at(j)) {
738	c[k++] = a->at(i++);
739	} else {
740	c[k++] = b.at(j++);
741	}
742	} else {
743	memcpy(c.data() + k, a->constData() + i, (asize - i) * sizeof(int));
744	break;
745	}
746	}
747	c.resize(csize);
748	if (j < bsize)
749	memcpy(c.data() + k, b.constData() + j, (bsize - j) * sizeof(int));
750	*a = c;
751	}
752	}
753
754	#ifndef QT_NO_REGEXP_WILDCARD
755	/*
756	Translates a wildcard pattern to an equivalent regular expression
757	pattern (e.g., .cpp to .\.cpp).
758
759	If enableEscaping is true, it is possible to escape the wildcard
760	characters with \
761	*/
762	static QString wc2rx(const QString &wc_str, const bool enableEscaping)
763	{
764	const int wclen = wc_str.length();
765	QString rx;
766	int i = 0;
767	bool isEscaping = false; // the previous character is '\'
768	const QChar *wc = wc_str.unicode();
769
770	while (i < wclen) {
771	const QChar c = wc[i++];
772	switch (c.unicode()) {
773	case '\\':
774	if (enableEscaping) {
775	if (isEscaping) {
776	rx += QLatin1String("\\\\");
777	} // we insert the \\ later if necessary
778	if (i+1 == wclen) { // the end
779	rx += QLatin1String("\\\\");
780	}
781	} else {
782	rx += QLatin1String("\\\\");
783	}
784	isEscaping = true;
785	break;
786	case '*':
787	if (isEscaping) {
788	rx += QLatin1String("\\*");
789	isEscaping = false;
790	} else {
791	rx += QLatin1String(".*");
792	}
793	break;
794	case '?':
795	if (isEscaping) {
796	rx += QLatin1String("\\?");
797	isEscaping = false;
798	} else {
799	rx += QLatin1Char('.');
800	}
801
802	break;
803	case '$':
804	case '(':
805	case ')':
806	case '+':
807	case '.':
808	case '^':
809	case '{':
810	case '\|':
811	case '}':
812	if (isEscaping) {
813	isEscaping = false;
814	rx += QLatin1String("\\\\");
815	}
816	rx += QLatin1Char('\\');
817	rx += c;
818	break;
819	case '[':
820	if (isEscaping) {
821	isEscaping = false;
822	rx += QLatin1String("\\[");
823	} else {
824	rx += c;
825	if (wc[i] == QLatin1Char('^'))
826	rx += wc[i++];
827	if (i < wclen) {
828	if (rx[i] == QLatin1Char(']'))
829	rx += wc[i++];
830	while (i < wclen && wc[i] != QLatin1Char(']')) {
831	if (wc[i] == QLatin1Char('\\'))
832	rx += QLatin1Char('\\');
833	rx += wc[i++];
834	}
835	}
836	}
837	break;
838
839	case ']':
840	if(isEscaping){
841	isEscaping = false;
842	rx += QLatin1String("\\");
843	}
844	rx += c;
845	break;
846
847	default:
848	if(isEscaping){
849	isEscaping = false;
850	rx += QLatin1String("\\\\");
851	}
852	rx += c;
853	}
854	}
855	return rx;
856	}
857	#endif
858
859	static int caretIndex(int offset, QRegExp::CaretMode caretMode)
860	{
861	if (caretMode == QRegExp::CaretAtZero) {
862	return 0;
863	} else if (caretMode == QRegExp::CaretAtOffset) {
864	return offset;
865	} else { // QRegExp::CaretWontMatch
866	return -1;
867	}
868	}
869
870	/*
871	The QRegExpEngineKey struct uniquely identifies an engine.
872	*/
873	struct QRegExpEngineKey
874	{
875	QString pattern;
876	QRegExp::PatternSyntax patternSyntax;
877	Qt::CaseSensitivity cs;
878
879	inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,
880	Qt::CaseSensitivity cs)
881	: pattern(pattern), patternSyntax(patternSyntax), cs(cs) {}
882
883	inline void clear() {
884	pattern.clear();
885	patternSyntax = QRegExp::RegExp;
886	cs = Qt::CaseSensitive;
887	}
888	};
889
890	Q_STATIC_GLOBAL_OPERATOR bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)
891	{
892	return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax
893	&& key1.cs == key2.cs;
894	}
895
896	class QRegExpEngine;
897
898	//Q_DECLARE_TYPEINFO(QVector<int>, Q_MOVABLE_TYPE);
899
900	/*
901	This is the engine state during matching.
902	*/
903	struct QRegExpMatchState
904	{
905	const QChar *in; // a pointer to the input string data
906	int pos; // the current position in the string
907	int caretPos;
908	int len; // the length of the input string
909	bool minimal; // minimal matching?
910	int *bigArray; // big array holding the data for the next pointers
911	int *inNextStack; // is state is nextStack?
912	int *curStack; // stack of current states
913	int *nextStack; // stack of next states
914	int *curCapBegin; // start of current states' captures
915	int *nextCapBegin; // start of next states' captures
916	int *curCapEnd; // end of current states' captures
917	int *nextCapEnd; // end of next states' captures
918	int *tempCapBegin; // start of temporary captures
919	int *tempCapEnd; // end of temporary captures
920	int *capBegin; // start of captures for a next state
921	int *capEnd; // end of captures for a next state
922	int *slideTab; // bump-along slide table for bad-character heuristic
923	int *captured; // what match() returned last
924	int slideTabSize; // size of slide table
925	int capturedSize;
926	#ifndef QT_NO_REGEXP_BACKREF
927	QList<QVector<int> > sleeping; // list of back-reference sleepers
928	#endif
929	int matchLen; // length of match
930	int oneTestMatchedLen; // length of partial match
931
932	const QRegExpEngine *eng;
933
934	inline QRegExpMatchState() : bigArray(0), captured(0) {}
935	inline ~QRegExpMatchState() { free(bigArray); }
936
937	void drain() { free(bigArray); bigArray = 0; captured = 0; } // to save memory
938	void prepareForMatch(QRegExpEngine *eng);
939	void match(const QChar *str, int len, int pos, bool minimal,
940	bool oneTest, int caretIndex);
941	bool matchHere();
942	bool testAnchor(int i, int a, const int *capBegin);
943	};
944
945	/*
946	The struct QRegExpAutomatonState represents one state in a modified NFA. The
947	input characters matched are stored in the state instead of on
948	the transitions, something possible for an automaton
949	constructed from a regular expression.
950	*/
951	struct QRegExpAutomatonState
952	{
953	#ifndef QT_NO_REGEXP_CAPTURE
954	int atom; // which atom does this state belong to?
955	#endif
956	int match; // what does it match? (see CharClassBit and BackRefBit)
957	QVector<int> outs; // out-transitions
958	QMap<int, int> reenter; // atoms reentered when transiting out
959	QMap<int, int> anchors; // anchors met when transiting out
960
961	inline QRegExpAutomatonState() { }
962	#ifndef QT_NO_REGEXP_CAPTURE
963	inline QRegExpAutomatonState(int a, int m)
964	: atom(a), match(m) { }
965	#else
966	inline QRegExpAutomatonState(int m)
967	: match(m) { }
968	#endif
969	};
970
971	Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE);
972
973	/*
974	The struct QRegExpCharClassRange represents a range of characters (e.g.,
975	[0-9] denotes range 48 to 57).
976	*/
977	struct QRegExpCharClassRange
978	{
979	ushort from; // 48
980	ushort len; // 10
981	};
982
983	Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
984
985	#ifndef QT_NO_REGEXP_CAPTURE
/*
    The struct QRegExpAtom represents one node in the hierarchy of
    regular expression atoms.
*/
struct QRegExpAtom
{
    // Negative placeholder values for the capture field.
    enum {
        NoCapture = -1,
        OfficialCapture = -2,
        UnofficialCapture = -3
    };

    int parent; // index of parent in array of atoms
    int capture; // index of capture, from 1 to ncap - 1
};
997
998	Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
999	#endif
1000
1001	struct QRegExpLookahead;
1002
1003	#ifndef QT_NO_REGEXP_ANCHOR_ALT
/*
    The struct QRegExpAnchorAlternation represents a pair of anchors
    with OR semantics: the alternation holds if either a or b holds.
*/
struct QRegExpAnchorAlternation
{
    int a; // this anchor...
    int b; // ...or this one
};
1013
1014	Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
1015	#endif
1016
1017	#ifndef QT_NO_REGEXP_CCLASS
1018	/*
1019	The class QRegExpCharClass represents a set of characters, such as can
1020	be found in regular expressions (e.g., [a-z] denotes the set
1021	{a, b, ..., z}).
1022	*/
1023	class QRegExpCharClass
1024	{
1025	public:
1026	QRegExpCharClass();
1027	inline QRegExpCharClass(const QRegExpCharClass &cc) { operator=(cc); }
1028
1029	QRegExpCharClass &operator=(const QRegExpCharClass &cc);
1030
1031	void clear();
1032	bool negative() const { return n; }
1033	void setNegative(bool negative);
1034	void addCategories(int cats);
1035	void addRange(ushort from, ushort to);
1036	void addSingleton(ushort ch) { addRange(ch, ch); }
1037
1038	bool in(QChar ch) const;
1039	#ifndef QT_NO_REGEXP_OPTIM
1040	const QVector<int> &firstOccurrence() const { return occ1; }
1041	#endif
1042
1043	#if defined(QT_DEBUG)
1044	void dump() const;
1045	#endif
1046
1047	private:
1048	int c; // character classes
1049	QVector<QRegExpCharClassRange> r; // character ranges
1050	bool n; // negative?
1051	#ifndef QT_NO_REGEXP_OPTIM
1052	QVector<int> occ1; // first-occurrence array
1053	#endif
1054	};
1055	#else
1056	struct QRegExpCharClass
1057	{
1058	int dummy;
1059
1060	#ifndef QT_NO_REGEXP_OPTIM
1061	QRegExpCharClass() { occ1.fill(0, NumBadChars); }
1062
1063	const QVector<int> &firstOccurrence() const { return occ1; }
1064	QVector<int> occ1;
1065	#endif
1066	};
1067	#endif
1068
1069	Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE);
1070
1071	/*
1072	The QRegExpEngine class encapsulates a modified nondeterministic
1073	finite automaton (NFA).
1074	*/
1075	class QRegExpEngine
1076	{
1077	public:
1078	QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)
1079	: cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }
1080
1081	QRegExpEngine(const QRegExpEngineKey &key);
1082	~QRegExpEngine();
1083
1084	bool isValid() const { return valid; }
1085	const QString &errorString() const { return yyError; }
1086	int captureCount() const { return officialncap; }
1087
1088	int createState(QChar ch);
1089	int createState(const QRegExpCharClass &cc);
1090	#ifndef QT_NO_REGEXP_BACKREF
1091	int createState(int bref);
1092	#endif
1093
1094	void addCatTransitions(const QVector<int> &from, const QVector<int> &to);
1095	#ifndef QT_NO_REGEXP_CAPTURE
1096	void addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom);
1097	#endif
1098
1099	#ifndef QT_NO_REGEXP_ANCHOR_ALT
1100	int anchorAlternation(int a, int b);
1101	int anchorConcatenation(int a, int b);
1102	#else
1103	int anchorAlternation(int a, int b) { return a & b; }
1104	int anchorConcatenation(int a, int b) { return a \| b; }
1105	#endif
1106	void addAnchors(int from, int to, int a);
1107
1108	#ifndef QT_NO_REGEXP_OPTIM
1109	void heuristicallyChooseHeuristic();
1110	#endif
1111
1112	#if defined(QT_DEBUG)
1113	void dump() const;
1114	#endif
1115
1116	QAtomicInt ref;
1117
1118	private:
1119	enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
1120	enum { InitialState = 0, FinalState = 1 };
1121
1122	void setup();
1123	int setupState(int match);
1124
1125	/*
1126	Let's hope that 13 lookaheads and 14 back-references are
1127	enough.
1128	*/
1129	enum { MaxLookaheads = 13, MaxBackRefs = 14 };
1130	enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,
1131	Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,
1132	Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1133	Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
1134	Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,
1135
1136	Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^
1137	((Anchor_FirstLookahead << MaxLookaheads) - 1) };
1138	#ifndef QT_NO_REGEXP_CAPTURE
1139	int startAtom(bool officialCapture);
1140	void finishAtom(int atom, bool needCapture);
1141	#endif
1142
1143	#ifndef QT_NO_REGEXP_LOOKAHEAD
1144	int addLookahead(QRegExpEngine *eng, bool negative);
1145	#endif
1146
1147	#ifndef QT_NO_REGEXP_OPTIM
1148	bool goodStringMatch(QRegExpMatchState &matchState) const;
1149	bool badCharMatch(QRegExpMatchState &matchState) const;
1150	#else
1151	bool bruteMatch(QRegExpMatchState &matchState) const;
1152	#endif
1153
1154	QVector<QRegExpAutomatonState> s; // array of states
1155	#ifndef QT_NO_REGEXP_CAPTURE
1156	QVector<QRegExpAtom> f; // atom hierarchy
1157	int nf; // number of atoms
1158	int cf; // current atom
1159	QVector<int> captureForOfficialCapture;
1160	#endif
1161	int officialncap; // number of captures, seen from the outside
1162	int ncap; // number of captures, seen from the inside
1163	#ifndef QT_NO_REGEXP_CCLASS
1164	QVector<QRegExpCharClass> cl; // array of character classes
1165	#endif
1166	#ifndef QT_NO_REGEXP_LOOKAHEAD
1167	QVector<QRegExpLookahead *> ahead; // array of lookaheads
1168	#endif
1169	#ifndef QT_NO_REGEXP_ANCHOR_ALT
1170	QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
1171	#endif
1172	#ifndef QT_NO_REGEXP_OPTIM
1173	bool caretAnchored; // does the regexp start with ^?
1174	bool trivial; // is the good-string all that needs to match?
1175	#endif
1176	bool valid; // is the regular expression valid?
1177	Qt::CaseSensitivity cs; // case sensitive?
1178	bool greedyQuantifiers; // RegExp2?
1179	bool xmlSchemaExtensions;
1180	#ifndef QT_NO_REGEXP_BACKREF
1181	int nbrefs; // number of back-references
1182	#endif
1183
1184	#ifndef QT_NO_REGEXP_OPTIM
1185	bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1186
1187	int goodEarlyStart; // the index where goodStr can first occur in a match
1188	int goodLateStart; // the index where goodStr can last occur in a match
1189	QString goodStr; // the string that any match has to contain
1190
1191	int minl; // the minimum length of a match
1192	QVector<int> occ1; // first-occurrence array
1193	#endif
1194
1195	/*
1196	The class Box is an abstraction for a regular expression
1197	fragment. It can also be seen as one node in the syntax tree of
1198	a regular expression with synthetized attributes.
1199
1200	Its interface is ugly for performance reasons.
1201	*/
1202	class Box
1203	{
1204	public:
1205	Box(QRegExpEngine *engine);
1206	Box(const Box &b) { operator=(b); }
1207
1208	Box &operator=(const Box &b);
1209
1210	void clear() { operator=(Box(eng)); }
1211	void set(QChar ch);
1212	void set(const QRegExpCharClass &cc);
1213	#ifndef QT_NO_REGEXP_BACKREF
1214	void set(int bref);
1215	#endif
1216
1217	void cat(const Box &b);
1218	void orx(const Box &b);
1219	void plus(int atom);
1220	void opt();
1221	void catAnchor(int a);
1222	#ifndef QT_NO_REGEXP_OPTIM
1223	void setupHeuristics();
1224	#endif
1225
1226	#if defined(QT_DEBUG)
1227	void dump() const;
1228	#endif
1229
1230	private:
1231	void addAnchorsToEngine(const Box &to) const;
1232
1233	QRegExpEngine *eng; // the automaton under construction
1234	QVector<int> ls; // the left states (firstpos)
1235	QVector<int> rs; // the right states (lastpos)
1236	QMap<int, int> lanchors; // the left anchors
1237	QMap<int, int> ranchors; // the right anchors
1238	int skipanchors; // the anchors to match if the box is skipped
1239
1240	#ifndef QT_NO_REGEXP_OPTIM
1241	int earlyStart; // the index where str can first occur
1242	int lateStart; // the index where str can last occur
1243	QString str; // a string that has to occur in any match
1244	QString leftStr; // a string occurring at the left of this box
1245	QString rightStr; // a string occurring at the right of this box
1246	int maxl; // the maximum length of this box (possibly InftyLen)
1247	#endif
1248
1249	int minl; // the minimum length of this box
1250	#ifndef QT_NO_REGEXP_OPTIM
1251	QVector<int> occ1; // first-occurrence array
1252	#endif
1253	};
1254
1255	friend class Box;
1256
1257	void setupCategoriesRangeMap();
1258
1259	/*
1260	This is the lexical analyzer for regular expressions.
1261	*/
1262	enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
1263	Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
1264	Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
1265	int getChar();
1266	int getEscape();
1267	#ifndef QT_NO_REGEXP_INTERVAL
1268	int getRep(int def);
1269	#endif
1270	#ifndef QT_NO_REGEXP_LOOKAHEAD
1271	void skipChars(int n);
1272	#endif
1273	void error(const char *msg);
1274	void startTokenizer(const QChar *rx, int len);
1275	int getToken();
1276
1277	const QChar *yyIn; // a pointer to the input regular expression pattern
1278	int yyPos0; // the position of yyTok in the input pattern
1279	int yyPos; // the position of the next character to read
1280	int yyLen; // the length of yyIn
1281	int yyCh; // the last character read
1282	QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens
1283	int yyMinRep; // attribute for Tok_Quantifier
1284	int yyMaxRep; // ditto
1285	QString yyError; // syntax error or overflow during parsing?
1286
1287	/*
1288	This is the syntactic analyzer for regular expressions.
1289	*/
1290	int parse(const QChar *rx, int len);
1291	void parseAtom(Box *box);
1292	void parseFactor(Box *box);
1293	void parseTerm(Box *box);
1294	void parseExpression(Box *box);
1295
1296	int yyTok; // the last token read
1297	bool yyMayCapture; // set this to false to disable capturing
1298	QHash<QByteArray, QPair<int, int> > categoriesRangeMap; // fast lookup hash for xml schema extensions
1299
1300	friend struct QRegExpMatchState;
1301	};
1302
1303	#ifndef QT_NO_REGEXP_LOOKAHEAD
1304	/*
1305	The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
1306	(?=foo) and (?!bar)).
1307	*/
1308	struct QRegExpLookahead
1309	{
1310	QRegExpEngine *eng; // NFA representing the embedded regular expression
1311	bool neg; // negative lookahead?
1312
1313	inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)
1314	: eng(eng0), neg(neg0) { }
1315	inline ~QRegExpLookahead() { delete eng; }
1316	};
1317	#endif
1318
1319	/*! \internal
1320	convert the pattern string to the RegExp syntax.
1321
1322	This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan
1323	*/
1324	Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax)
1325	{
1326	switch (patternSyntax) {
1327	#ifndef QT_NO_REGEXP_WILDCARD
1328	case QRegExp::Wildcard:
1329	return wc2rx(pattern, false);
1330	break;
1331	case QRegExp::WildcardUnix:
1332	return wc2rx(pattern, true);
1333	break;
1334	#endif
1335	case QRegExp::FixedString:
1336	return QRegExp::escape(pattern);
1337	break;
1338	case QRegExp::W3CXmlSchema11:
1339	default:
1340	return pattern;
1341	}
1342	}
1343
1344	QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
1345	: cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),
1346	xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)
1347	{
1348	setup();
1349
1350	QString rx = qt_regexp_toCanonical(key.pattern, key.patternSyntax);
1351
1352	valid = (parse(rx.unicode(), rx.length()) == rx.length());
1353	if (!valid) {
1354	#ifndef QT_NO_REGEXP_OPTIM
1355	trivial = false;
1356	#endif
1357	error(RXERR_LEFTDELIM);
1358	}
1359	}
1360
1361	QRegExpEngine::~QRegExpEngine()
1362	{
1363	#ifndef QT_NO_REGEXP_LOOKAHEAD
1364	qDeleteAll(ahead);
1365	#endif
1366	}
1367
1368	void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
1369	{
1370	/*
1371	We use one QVector<int> for all the big data used a lot in
1372	matchHere() and friends.
1373	*/
1374	int ns = eng->s.size(); // number of states
1375	int ncap = eng->ncap;
1376	#ifndef QT_NO_REGEXP_OPTIM
1377	int newSlideTabSize = qMax(eng->minl + 1, 16);
1378	#else
1379	int newSlideTabSize = 0;
1380	#endif
1381	int numCaptures = eng->captureCount();
1382	int newCapturedSize = 2 + 2 * numCaptures;
1383	bigArray = q_check_ptr((int )realloc(bigArray, ((3 + 4 ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));
1384
1385	// set all internal variables only _after_ bigArray is realloc'ed
1386	// to prevent a broken regexp in oom case
1387
1388	slideTabSize = newSlideTabSize;
1389	capturedSize = newCapturedSize;
1390	inNextStack = bigArray;
1391	memset(inNextStack, -1, ns * sizeof(int));
1392	curStack = inNextStack + ns;
1393	nextStack = inNextStack + 2 * ns;
1394
1395	curCapBegin = inNextStack + 3 * ns;
1396	nextCapBegin = curCapBegin + ncap * ns;
1397	curCapEnd = curCapBegin + 2 * ncap * ns;
1398	nextCapEnd = curCapBegin + 3 * ncap * ns;
1399
1400	tempCapBegin = curCapBegin + 4 * ncap * ns;
1401	tempCapEnd = tempCapBegin + ncap;
1402	capBegin = tempCapBegin + 2 * ncap;
1403	capEnd = tempCapBegin + 3 * ncap;
1404
1405	slideTab = tempCapBegin + 4 * ncap;
1406	captured = slideTab + slideTabSize;
1407	memset(captured, -1, capturedSize*sizeof(int));
1408	this->eng = eng;
1409	}
1410
1411	/*
1412	Tries to match in str and returns an array of (begin, length) pairs
1413	for captured text. If there is no match, all pairs are (-1, -1).
1414	*/
1415	void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,
1416	bool minimal0, bool oneTest, int caretIndex)
1417	{
1418	bool matched = false;
1419	QChar char_null;
1420
1421	#ifndef QT_NO_REGEXP_OPTIM
1422	if (eng->trivial && !oneTest) {
1423	pos = qFindString(str0, len0, pos0, eng->goodStr.unicode(), eng->goodStr.length(), eng->cs);
1424	matchLen = eng->goodStr.length();
1425	matched = (pos != -1);
1426	} else
1427	#endif
1428	{
1429	in = str0;
1430	if (in == 0)
1431	in = &char_null;
1432	pos = pos0;
1433	caretPos = caretIndex;
1434	len = len0;
1435	minimal = minimal0;
1436	matchLen = 0;
1437	oneTestMatchedLen = 0;
1438
1439	if (eng->valid && pos >= 0 && pos <= len) {
1440	#ifndef QT_NO_REGEXP_OPTIM
1441	if (oneTest) {
1442	matched = matchHere();
1443	} else {
1444	if (pos <= len - eng->minl) {
1445	if (eng->caretAnchored) {
1446	matched = matchHere();
1447	} else if (eng->useGoodStringHeuristic) {
1448	matched = eng->goodStringMatch(*this);
1449	} else {
1450	matched = eng->badCharMatch(*this);
1451	}
1452	}
1453	}
1454	#else
1455	matched = oneTest ? matchHere() : eng->bruteMatch(*this);
1456	#endif
1457	}
1458	}
1459
1460	if (matched) {
1461	int *c = captured;
1462	*c++ = pos;
1463	*c++ = matchLen;
1464
1465	int numCaptures = (capturedSize - 2) >> 1;
1466	#ifndef QT_NO_REGEXP_CAPTURE
1467	for (int i = 0; i < numCaptures; ++i) {
1468	int j = eng->captureForOfficialCapture.at(i);
1469	int len = capEnd[j] - capBegin[j];
1470	*c++ = (len > 0) ? pos + capBegin[j] : 0;
1471	*c++ = len;
1472	}
1473	#endif
1474	} else {
1475	// we rely on 2's complement here
1476	memset(captured, -1, capturedSize * sizeof(int));
1477	}
1478	}
1479
1480	/*
1481	The three following functions add one state to the automaton and
1482	return the number of the state.
1483	*/
1484
1485	int QRegExpEngine::createState(QChar ch)
1486	{
1487	return setupState(ch.unicode());
1488	}
1489
1490	int QRegExpEngine::createState(const QRegExpCharClass &cc)
1491	{
1492	#ifndef QT_NO_REGEXP_CCLASS
1493	int n = cl.size();
1494	cl += QRegExpCharClass(cc);
1495	return setupState(CharClassBit \| n);
1496	#else
1497	Q_UNUSED(cc);
1498	return setupState(CharClassBit);
1499	#endif
1500	}
1501
1502	#ifndef QT_NO_REGEXP_BACKREF
1503	int QRegExpEngine::createState(int bref)
1504	{
1505	if (bref > nbrefs) {
1506	nbrefs = bref;
1507	if (nbrefs > MaxBackRefs) {
1508	error(RXERR_LIMIT);
1509	return 0;
1510	}
1511	}
1512	return setupState(BackRefBit \| bref);
1513	}
1514	#endif
1515
1516	/*
1517	The two following functions add a transition between all pairs of
1518	states (i, j) where i is found in from, and j is found in to.
1519
1520	Cat-transitions are distinguished from plus-transitions for
1521	capturing.
1522	*/
1523
1524	void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to)
1525	{
1526	for (int i = 0; i < from.size(); i++)
1527	mergeInto(&s[from.at(i)].outs, to);
1528	}
1529
1530	#ifndef QT_NO_REGEXP_CAPTURE
1531	void QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom)
1532	{
1533	for (int i = 0; i < from.size(); i++) {
1534	QRegExpAutomatonState &st = s[from.at(i)];
1535	const QVector<int> oldOuts = st.outs;
1536	mergeInto(&st.outs, to);
1537	if (f.at(atom).capture != QRegExpAtom::NoCapture) {
1538	for (int j = 0; j < to.size(); j++) {
1539	// ### st.reenter.contains(to.at(j)) check looks suspicious
1540	if (!st.reenter.contains(to.at(j)) &&
1541	qBinaryFind(oldOuts.constBegin(), oldOuts.constEnd(), to.at(j)) == oldOuts.end())
1542	st.reenter.insert(to.at(j), atom);
1543	}
1544	}
1545	}
1546	}
1547	#endif
1548
1549	#ifndef QT_NO_REGEXP_ANCHOR_ALT
1550	/*
1551	Returns an anchor that means a OR b.
1552	*/
1553	int QRegExpEngine::anchorAlternation(int a, int b)
1554	{
1555	if (((a & b) == a \|\| (a & b) == b) && ((a \| b) & Anchor_Alternation) == 0)
1556	return a & b;
1557
1558	int n = aa.size();
1559	#ifndef QT_NO_REGEXP_OPTIM
1560	if (n > 0 && aa.at(n - 1).a == a && aa.at(n - 1).b == b)
1561	return Anchor_Alternation \| (n - 1);
1562	#endif
1563
1564	QRegExpAnchorAlternation element = {a, b};
1565	aa.append(element);
1566	return Anchor_Alternation \| n;
1567	}
1568
1569	/*
1570	Returns an anchor that means a AND b.
1571	*/
1572	int QRegExpEngine::anchorConcatenation(int a, int b)
1573	{
1574	if (((a \| b) & Anchor_Alternation) == 0)
1575	return a \| b;
1576	if ((b & Anchor_Alternation) != 0)
1577	qSwap(a, b);
1578
1579	int aprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).a, b);
1580	int bprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).b, b);
1581	return anchorAlternation(aprime, bprime);
1582	}
1583	#endif
1584
1585	/*
1586	Adds anchor a on a transition caracterised by its from state and
1587	its to state.
1588	*/
1589	void QRegExpEngine::addAnchors(int from, int to, int a)
1590	{
1591	QRegExpAutomatonState &st = s[from];
1592	if (st.anchors.contains(to))
1593	a = anchorAlternation(st.anchors.value(to), a);
1594	st.anchors.insert(to, a);
1595	}
1596
1597	#ifndef QT_NO_REGEXP_OPTIM
1598	/*
1599	This function chooses between the good-string and the bad-character
1600	heuristics. It computes two scores and chooses the heuristic with
1601	the highest score.
1602
1603	Here are some common-sense constraints on the scores that should be
1604	respected if the formulas are ever modified: (1) If goodStr is
1605	empty, the good-string heuristic scores 0. (2) If the regular
1606	expression is trivial, the good-string heuristic should be used.
1607	(3) If the search is case insensitive, the good-string heuristic
1608	should be used, unless it scores 0. (Case insensitivity turns all
1609	entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1610	big, the good-string heuristic should score less.
1611	*/
1612	void QRegExpEngine::heuristicallyChooseHeuristic()
1613	{
1614	if (minl == 0) {
1615	useGoodStringHeuristic = false;
1616	} else if (trivial) {
1617	useGoodStringHeuristic = true;
1618	} else {
1619	/*
1620	Magic formula: The good string has to constitute a good
1621	proportion of the minimum-length string, and appear at a
1622	more-or-less known index.
1623	*/
1624	int goodStringScore = (64 * goodStr.length() / minl) -
1625	(goodLateStart - goodEarlyStart);
1626	/*
1627	Less magic formula: We pick some characters at random, and
1628	check whether they are good or bad.
1629	*/
1630	int badCharScore = 0;
1631	int step = qMax(1, NumBadChars / 32);
1632	for (int i = 1; i < NumBadChars; i += step) {
1633	if (occ1.at(i) == NoOccurrence)
1634	badCharScore += minl;
1635	else
1636	badCharScore += occ1.at(i);
1637	}
1638	badCharScore /= minl;
1639	useGoodStringHeuristic = (goodStringScore > badCharScore);
1640	}
1641	}
1642	#endif
1643
1644	#if defined(QT_DEBUG)
1645	void QRegExpEngine::dump() const
1646	{
1647	int i, j;
1648	qDebug("Case %ssensitive engine", cs ? "" : "in");
1649	qDebug(" States");
1650	for (i = 0; i < s.size(); i++) {
1651	qDebug(" %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
1652	#ifndef QT_NO_REGEXP_CAPTURE
1653	if (nf > 0)
1654	qDebug(" in atom %d", s[i].atom);
1655	#endif
1656	int m = s[i].match;
1657	if ((m & CharClassBit) != 0) {
1658	qDebug(" match character class %d", m ^ CharClassBit);
1659	#ifndef QT_NO_REGEXP_CCLASS
1660	cl[m ^ CharClassBit].dump();
1661	#else
1662	qDebug(" negative character class");
1663	#endif
1664	} else if ((m & BackRefBit) != 0) {
1665	qDebug(" match back-reference %d", m ^ BackRefBit);
1666	} else if (m >= 0x20 && m <= 0x7e) {
1667	qDebug(" match 0x%.4x (%c)", m, m);
1668	} else {
1669	qDebug(" match 0x%.4x", m);
1670	}
1671	for (j = 0; j < s[i].outs.size(); j++) {
1672	int next = s[i].outs[j];
1673	qDebug(" -> %d", next);
1674	if (s[i].reenter.contains(next))
1675	qDebug(" [reenter %d]", s[i].reenter[next]);
1676	if (s[i].anchors.value(next) != 0)
1677	qDebug(" [anchors 0x%.8x]", s[i].anchors[next]);
1678	}
1679	}
1680	#ifndef QT_NO_REGEXP_CAPTURE
1681	if (nf > 0) {
1682	qDebug(" Atom Parent Capture");
1683	for (i = 0; i < nf; i++) {
1684	if (f[i].capture == QRegExpAtom::NoCapture) {
1685	qDebug(" %6d %6d nil", i, f[i].parent);
1686	} else {
1687	int cap = f[i].capture;
1688	bool official = captureForOfficialCapture.contains(cap);
1689	qDebug(" %6d %6d %6d %s", i, f[i].parent, f[i].capture,
1690	official ? "official" : "");
1691	}
1692	}
1693	}
1694	#endif
1695	#ifndef QT_NO_REGEXP_ANCHOR_ALT
1696	for (i = 0; i < aa.size(); i++)
1697	qDebug(" Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b);
1698	#endif
1699	}
1700	#endif
1701
1702	void QRegExpEngine::setup()
1703	{
1704	ref = 1;
1705	#ifndef QT_NO_REGEXP_CAPTURE
1706	f.resize(32);
1707	nf = 0;
1708	cf = -1;
1709	#endif
1710	officialncap = 0;
1711	ncap = 0;
1712	#ifndef QT_NO_REGEXP_OPTIM
1713	caretAnchored = true;
1714	trivial = true;
1715	#endif
1716	valid = false;
1717	#ifndef QT_NO_REGEXP_BACKREF
1718	nbrefs = 0;
1719	#endif
1720	#ifndef QT_NO_REGEXP_OPTIM
1721	useGoodStringHeuristic = true;
1722	minl = 0;
1723	occ1.fill(0, NumBadChars);
1724	#endif
1725	}
1726
1727	int QRegExpEngine::setupState(int match)
1728	{
1729	#ifndef QT_NO_REGEXP_CAPTURE
1730	s += QRegExpAutomatonState(cf, match);
1731	#else
1732	s += QRegExpAutomatonState(match);
1733	#endif
1734	return s.size() - 1;
1735	}
1736
1737	#ifndef QT_NO_REGEXP_CAPTURE
1738	/*
1739	Functions startAtom() and finishAtom() should be called to delimit
1740	atoms. When a state is created, it is assigned to the current atom.
1741	The information is later used for capturing.
1742	*/
1743	int QRegExpEngine::startAtom(bool officialCapture)
1744	{
1745	if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size())
1746	f.resize((nf + 1) << 1);
1747	f[nf].parent = cf;
1748	cf = nf++;
1749	f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;
1750	return cf;
1751	}
1752
1753	void QRegExpEngine::finishAtom(int atom, bool needCapture)
1754	{
1755	if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture)
1756	f[atom].capture = QRegExpAtom::UnofficialCapture;
1757	cf = f.at(atom).parent;
1758	}
1759	#endif
1760
1761	#ifndef QT_NO_REGEXP_LOOKAHEAD
1762	/*
1763	Creates a lookahead anchor.
1764	*/
1765	int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative)
1766	{
1767	int n = ahead.size();
1768	if (n == MaxLookaheads) {
1769	error(RXERR_LIMIT);
1770	return 0;
1771	}
1772	ahead += new QRegExpLookahead(eng, negative);
1773	return Anchor_FirstLookahead << n;
1774	}
1775	#endif
1776
1777	#ifndef QT_NO_REGEXP_CAPTURE
/*
  We want the longest leftmost captures.

  Compares two candidate capture sets of ncap entries each, given as
  parallel begin/end arrays, and returns true if candidate 1
  (begin1/end1) is better than candidate 2 (begin2/end2). The first
  capture on which the two differ decides: an earlier begin wins, and
  for equal begins a later end wins. Identical candidates (and
  ncap == 0) yield false.

  All four array parameters are pointers; the body indexes each of
  them per capture.
*/
static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
                            const int *end2)
{
    for (int i = 0; i < ncap; i++) {
        int delta = begin2[i] - begin1[i]; // it has to start early...
        if (delta == 0)
            delta = end1[i] - end2[i]; // ...and end late

        if (delta != 0)
            return delta > 0;
    }
    return false;
}
1794	#endif
1795
1796	/*
1797	Returns true if anchor a matches at position pos + i in the input
1798	string, otherwise false.
1799	*/
1800	bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)
1801	{
1802	int j;
1803
1804	#ifndef QT_NO_REGEXP_ANCHOR_ALT
1805	if ((a & QRegExpEngine::Anchor_Alternation) != 0)
1806	return testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)
1807	\|\| testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);
1808	#endif
1809
1810	if ((a & QRegExpEngine::Anchor_Caret) != 0) {
1811	if (pos + i != caretPos)
1812	return false;
1813	}
1814	if ((a & QRegExpEngine::Anchor_Dollar) != 0) {
1815	if (pos + i != len)
1816	return false;
1817	}
1818	#ifndef QT_NO_REGEXP_ESCAPE
1819	if ((a & (QRegExpEngine::Anchor_Word \| QRegExpEngine::Anchor_NonWord)) != 0) {
1820	bool before = false;
1821	bool after = false;
1822	if (pos + i != 0)
1823	before = isWord(in[pos + i - 1]);
1824	if (pos + i != len)
1825	after = isWord(in[pos + i]);
1826	if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after))
1827	return false;
1828	if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after))
1829	return false;
1830	}
1831	#endif
1832	#ifndef QT_NO_REGEXP_LOOKAHEAD
1833	if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) {
1834	const QVector<QRegExpLookahead *> &ahead = eng->ahead;
1835	for (j = 0; j < ahead.size(); j++) {
1836	if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) {
1837	QRegExpMatchState matchState;
1838	matchState.prepareForMatch(ahead[j]->eng);
1839	matchState.match(in + pos + i, len - pos - i, 0,
1840	true, true, matchState.caretPos - matchState.pos - i);
1841	if ((matchState.captured[0] == 0) == ahead[j]->neg)
1842	return false;
1843	}
1844	}
1845	}
1846	#endif
1847	#ifndef QT_NO_REGEXP_CAPTURE
1848	#ifndef QT_NO_REGEXP_BACKREF
1849	for (j = 0; j < eng->nbrefs; j++) {
1850	if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) {
1851	int i = eng->captureForOfficialCapture.at(j);
1852	if (capBegin[i] != EmptyCapture)
1853	return false;
1854	}
1855	}
1856	#endif
1857	#endif
1858	return true;
1859	}
1860
1861	#ifndef QT_NO_REGEXP_OPTIM
1862	/*
1863	The three following functions are what Jeffrey Friedl would call
1864	transmissions (or bump-alongs). Using one or the other should make
1865	no difference except in performance.
1866	*/
1867
1868	bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
1869	{
1870	int k = matchState.pos + goodEarlyStart;
1871	QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs);
1872	while ((k = matcher.indexIn(matchState.in, matchState.len, k)) != -1) {
1873	int from = k - goodLateStart;
1874	int to = k - goodEarlyStart;
1875	if (from > matchState.pos)
1876	matchState.pos = from;
1877
1878	while (matchState.pos <= to) {
1879	if (matchState.matchHere())
1880	return true;
1881	++matchState.pos;
1882	}
1883	++k;
1884	}
1885	return false;
1886	}
1887
1888	bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
1889	{
1890	int slideHead = 0;
1891	int slideNext = 0;
1892	int i;
1893	int lastPos = matchState.len - minl;
1894	memset(matchState.slideTab, 0, matchState.slideTabSize * sizeof(int));
1895
1896	/*
1897	Set up the slide table, used for the bad-character heuristic,
1898	using the table of first occurrence of each character.
1899	*/
1900	for (i = 0; i < minl; i++) {
1901	int sk = occ1[BadChar(matchState.in[matchState.pos + i])];
1902	if (sk == NoOccurrence)
1903	sk = i + 1;
1904	if (sk > 0) {
1905	int k = i + 1 - sk;
1906	if (k < 0) {
1907	sk = i + 1;
1908	k = 0;
1909	}
1910	if (sk > matchState.slideTab[k])
1911	matchState.slideTab[k] = sk;
1912	}
1913	}
1914
1915	if (matchState.pos > lastPos)
1916	return false;
1917
1918	for (;;) {
1919	if (++slideNext >= matchState.slideTabSize)
1920	slideNext = 0;
1921	if (matchState.slideTab[slideHead] > 0) {
1922	if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext])
1923	matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1;
1924	matchState.slideTab[slideHead] = 0;
1925	} else {
1926	if (matchState.matchHere())
1927	return true;
1928	}
1929
1930	if (matchState.pos == lastPos)
1931	break;
1932
1933	/*
1934	Update the slide table. This code has much in common with
1935	the initialization code.
1936	*/
1937	int sk = occ1[BadChar(matchState.in[matchState.pos + minl])];
1938	if (sk == NoOccurrence) {
1939	matchState.slideTab[slideNext] = minl;
1940	} else if (sk > 0) {
1941	int k = slideNext + minl - sk;
1942	if (k >= matchState.slideTabSize)
1943	k -= matchState.slideTabSize;
1944	if (sk > matchState.slideTab[k])
1945	matchState.slideTab[k] = sk;
1946	}
1947	slideHead = slideNext;
1948	++matchState.pos;
1949	}
1950	return false;
1951	}
1952	#else
1953	bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
1954	{
1955	while (matchState.pos <= matchState.len) {
1956	if (matchState.matchHere())
1957	return true;
1958	++matchState.pos;
1959	}
1960	return false;
1961	}
1962	#endif
1963
1964	/*
1965	Here's the core of the engine. It tries to do a match here and now.
1966	*/
1967	bool QRegExpMatchState::matchHere()
1968	{
1969	int ncur = 1, nnext = 0;
1970	int i = 0, j, k, m;
1971	bool stop = false;
1972
1973	matchLen = -1;
1974	oneTestMatchedLen = -1;
1975	curStack[0] = QRegExpEngine::InitialState;
1976
1977	int ncap = eng->ncap;
1978	#ifndef QT_NO_REGEXP_CAPTURE
1979	if (ncap > 0) {
1980	for (j = 0; j < ncap; j++) {
1981	curCapBegin[j] = EmptyCapture;
1982	curCapEnd[j] = EmptyCapture;
1983	}
1984	}
1985	#endif
1986
1987	#ifndef QT_NO_REGEXP_BACKREF
1988	while ((ncur > 0 \|\| !sleeping.isEmpty()) && i <= len - pos && !stop)
1989	#else
1990	while (ncur > 0 && i <= len - pos && !stop)
1991	#endif
1992	{
1993	int ch = (i < len - pos) ? in[pos + i].unicode() : 0;
1994	for (j = 0; j < ncur; j++) {
1995	int cur = curStack[j];
1996	const QRegExpAutomatonState &scur = eng->s.at(cur);
1997	const QVector<int> &outs = scur.outs;
1998	for (k = 0; k < outs.size(); k++) {
1999	int next = outs.at(k);
2000	const QRegExpAutomatonState &snext = eng->s.at(next);
2001	bool inside = true;
2002	#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2003	int needSomeSleep = 0;
2004	#endif
2005
2006	/*
2007	First, check if the anchors are anchored properly.
2008	*/
2009	int a = scur.anchors.value(next);
2010	if (a != 0 && !testAnchor(i, a, curCapBegin + j * ncap))
2011	inside = false;
2012
2013	/*
2014	If indeed they are, check if the input character is
2015	correct for this transition.
2016	*/
2017	if (inside) {
2018	m = snext.match;
2019	if ((m & (QRegExpEngine::CharClassBit \| QRegExpEngine::BackRefBit)) == 0) {
2020	if (eng->cs)
2021	inside = (m == ch);
2022	else
2023	inside = (QChar(m).toLower() == QChar(ch).toLower());
2024	} else if (next == QRegExpEngine::FinalState) {
2025	matchLen = i;
2026	stop = minimal;
2027	inside = true;
2028	} else if ((m & QRegExpEngine::CharClassBit) != 0) {
2029	#ifndef QT_NO_REGEXP_CCLASS
2030	const QRegExpCharClass &cc = eng->cl.at(m ^ QRegExpEngine::CharClassBit);
2031	if (eng->cs)
2032	inside = cc.in(ch);
2033	else if (cc.negative())
2034	inside = cc.in(QChar(ch).toLower()) &&
2035	cc.in(QChar(ch).toUpper());
2036	else
2037	inside = cc.in(QChar(ch).toLower()) \|\|
2038	cc.in(QChar(ch).toUpper());
2039	#endif
2040	#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2041	} else { /* ((m & QRegExpEngine::BackRefBit) != 0) */
2042	int bref = m ^ QRegExpEngine::BackRefBit;
2043	int ell = j * ncap + eng->captureForOfficialCapture.at(bref - 1);
2044
2045	inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
2046	if (inside) {
2047	if (eng->cs)
2048	inside = (in[pos + curCapBegin[ell]] == QChar(ch));
2049	else
2050	inside = (in[pos + curCapBegin[ell]].toLower()
2051	== QChar(ch).toLower());
2052	}
2053
2054	if (inside) {
2055	int delta;
2056	if (curCapEnd[ell] == EmptyCapture)
2057	delta = i - curCapBegin[ell];
2058	else
2059	delta = curCapEnd[ell] - curCapBegin[ell];
2060
2061	inside = (delta <= len - (pos + i));
2062	if (inside && delta > 1) {
2063	int n = 1;
2064	if (eng->cs) {
2065	while (n < delta) {
2066	if (in[pos + curCapBegin[ell] + n]
2067	!= in[pos + i + n])
2068	break;
2069	++n;
2070	}
2071	} else {
2072	while (n < delta) {
2073	QChar a = in[pos + curCapBegin[ell] + n];
2074	QChar b = in[pos + i + n];
2075	if (a.toLower() != b.toLower())
2076	break;
2077	++n;
2078	}
2079	}
2080	inside = (n == delta);
2081	if (inside)
2082	needSomeSleep = delta - 1;
2083	}
2084	}
2085	#endif
2086	}
2087	}
2088
2089	/*
2090	We must now update our data structures.
2091	*/
2092	if (inside) {
2093	#ifndef QT_NO_REGEXP_CAPTURE
2094	int capBegin, capEnd;
2095	#endif
2096	/*
2097	If the next state was not encountered yet, all
2098	is fine.
2099	*/
2100	if ((m = inNextStack[next]) == -1) {
2101	m = nnext++;
2102	nextStack[m] = next;
2103	inNextStack[next] = m;
2104	#ifndef QT_NO_REGEXP_CAPTURE
2105	capBegin = nextCapBegin + m * ncap;
2106	capEnd = nextCapEnd + m * ncap;
2107
2108	/*
2109	Otherwise, we'll first maintain captures in
2110	temporary arrays, and decide at the end whether
2111	it's best to keep the previous capture zones or
2112	the new ones.
2113	*/
2114	} else {
2115	capBegin = tempCapBegin;
2116	capEnd = tempCapEnd;
2117	#endif
2118	}
2119
2120	#ifndef QT_NO_REGEXP_CAPTURE
2121	/*
2122	Updating the capture zones is much of a task.
2123	*/
2124	if (ncap > 0) {
2125	memcpy(capBegin, curCapBegin + j * ncap, ncap * sizeof(int));
2126	memcpy(capEnd, curCapEnd + j * ncap, ncap * sizeof(int));
2127	int c = scur.atom, n = snext.atom;
2128	int p = -1, q = -1;
2129	int cap;
2130
2131	/*
2132	Lemma 1. For any x in the range [0..nf), we
2133	have f[x].parent < x.
2134
2135	Proof. By looking at startAtom(), it is
2136	clear that cf < nf holds all the time, and
2137	thus that f[nf].parent < nf.
2138	*/
2139
2140	/*
2141	If we are reentering an atom, we empty all
2142	capture zones inside it.
2143	*/
2144	if ((q = scur.reenter.value(next)) != 0) {
2145	QBitArray b(eng->nf, false);
2146	b.setBit(q, true);
2147	for (int ell = q + 1; ell < eng->nf; ell++) {
2148	if (b.testBit(eng->f.at(ell).parent)) {
2149	b.setBit(ell, true);
2150	cap = eng->f.at(ell).capture;
2151	if (cap >= 0) {
2152	capBegin[cap] = EmptyCapture;
2153	capEnd[cap] = EmptyCapture;
2154	}
2155	}
2156	}
2157	p = eng->f.at(q).parent;
2158
2159	/*
2160	Otherwise, close the capture zones we are
2161	leaving. We are leaving f[c].capture,
2162	f[f[c].parent].capture,
2163	f[f[f[c].parent].parent].capture, ...,
2164	until f[x].capture, with x such that
2165	f[x].parent is the youngest common ancestor
2166	for c and n.
2167
2168	We go up along c's and n's ancestry until
2169	we find x.
2170	*/
2171	} else {
2172	p = c;
2173	q = n;
2174	while (p != q) {
2175	if (p > q) {
2176	cap = eng->f.at(p).capture;
2177	if (cap >= 0) {
2178	if (capBegin[cap] == i) {
2179	capBegin[cap] = EmptyCapture;
2180	capEnd[cap] = EmptyCapture;
2181	} else {
2182	capEnd[cap] = i;
2183	}
2184	}
2185	p = eng->f.at(p).parent;
2186	} else {
2187	q = eng->f.at(q).parent;
2188	}
2189	}
2190	}
2191
2192	/*
2193	In any case, we now open the capture zones
2194	we are entering. We work upwards from n
2195	until we reach p (the parent of the atom we
2196	reenter or the youngest common ancestor).
2197	*/
2198	while (n > p) {
2199	cap = eng->f.at(n).capture;
2200	if (cap >= 0) {
2201	capBegin[cap] = i;
2202	capEnd[cap] = EmptyCapture;
2203	}
2204	n = eng->f.at(n).parent;
2205	}
2206	/*
2207	If the next state was already in
2208	nextStack, we must choose carefully which
2209	capture zones we want to keep.
2210	*/
2211	if (capBegin == tempCapBegin &&
2212	isBetterCapture(ncap, capBegin, capEnd, nextCapBegin + m * ncap,
2213	nextCapEnd + m * ncap)) {
2214	memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2215	memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2216	}
2217	}
2218	#ifndef QT_NO_REGEXP_BACKREF
2219	/*
2220	We are done with updating the capture zones.
2221	It's now time to put the next state to sleep,
2222	if it needs to, and to remove it from
2223	nextStack.
2224	*/
2225	if (needSomeSleep > 0) {
2226	QVector<int> zzZ(2 + 2 * ncap);
2227	zzZ[0] = i + needSomeSleep;
2228	zzZ[1] = next;
2229	if (ncap > 0) {
2230	memcpy(zzZ.data() + 2, capBegin, ncap * sizeof(int));
2231	memcpy(zzZ.data() + 2 + ncap, capEnd, ncap * sizeof(int));
2232	}
2233	inNextStack[nextStack[--nnext]] = -1;
2234	sleeping.append(zzZ);
2235	}
2236	#endif
2237	#endif
2238	}
2239	}
2240	}
2241	#ifndef QT_NO_REGEXP_CAPTURE
2242	/*
2243	If we reached the final state, hurray! Copy the captured
2244	zone.
2245	*/
2246	if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {
2247	memcpy(capBegin, nextCapBegin + m * ncap, ncap * sizeof(int));
2248	memcpy(capEnd, nextCapEnd + m * ncap, ncap * sizeof(int));
2249	}
2250	#ifndef QT_NO_REGEXP_BACKREF
2251	/*
2252	It's time to wake up the sleepers.
2253	*/
2254	j = 0;
2255	while (j < sleeping.count()) {
2256	if (sleeping.at(j)[0] == i) {
2257	const QVector<int> &zzZ = sleeping.at(j);
2258	int next = zzZ[1];
2259	const int *capBegin = zzZ.data() + 2;
2260	const int *capEnd = zzZ.data() + 2 + ncap;
2261	bool copyOver = true;
2262
2263	if ((m = inNextStack[next]) == -1) {
2264	m = nnext++;
2265	nextStack[m] = next;
2266	inNextStack[next] = m;
2267	} else {
2268	copyOver = isBetterCapture(ncap, nextCapBegin + m * ncap, nextCapEnd + m * ncap,
2269	capBegin, capEnd);
2270	}
2271	if (copyOver) {
2272	memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2273	memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2274	}
2275
2276	sleeping.removeAt(j);
2277	} else {
2278	++j;
2279	}
2280	}
2281	#endif
2282	#endif
2283	for (j = 0; j < nnext; j++)
2284	inNextStack[nextStack[j]] = -1;
2285
2286	// avoid needless iteration that confuses oneTestMatchedLen
2287	if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState
2288	#ifndef QT_NO_REGEXP_BACKREF
2289	&& sleeping.isEmpty()
2290	#endif
2291	)
2292	stop = true;
2293
2294	qSwap(curStack, nextStack);
2295	#ifndef QT_NO_REGEXP_CAPTURE
2296	qSwap(curCapBegin, nextCapBegin);
2297	qSwap(curCapEnd, nextCapEnd);
2298	#endif
2299	ncur = nnext;
2300	nnext = 0;
2301	++i;
2302	}
2303
2304	#ifndef QT_NO_REGEXP_BACKREF
2305	/*
2306	If minimal matching is enabled, we might have some sleepers
2307	left.
2308	*/
2309	if (!sleeping.isEmpty())
2310	sleeping.clear();
2311	#endif
2312
2313	oneTestMatchedLen = i - 1;
2314	return (matchLen >= 0);
2315	}
2316
2317	#ifndef QT_NO_REGEXP_CCLASS
2318
2319	QRegExpCharClass::QRegExpCharClass()
2320	: c(0), n(false)
2321	{
2322	#ifndef QT_NO_REGEXP_OPTIM
2323	occ1.fill(NoOccurrence, NumBadChars);
2324	#endif
2325	}
2326
2327	QRegExpCharClass &QRegExpCharClass::operator=(const QRegExpCharClass &cc)
2328	{
2329	c = cc.c;
2330	r = cc.r;
2331	n = cc.n;
2332	#ifndef QT_NO_REGEXP_OPTIM
2333	occ1 = cc.occ1;
2334	#endif
2335	return *this;
2336	}
2337
2338	void QRegExpCharClass::clear()
2339	{
2340	c = 0;
2341	r.resize(0);
2342	n = false;
2343	}
2344
2345	void QRegExpCharClass::setNegative(bool negative)
2346	{
2347	n = negative;
2348	#ifndef QT_NO_REGEXP_OPTIM
2349	occ1.fill(0, NumBadChars);
2350	#endif
2351	}
2352
2353	void QRegExpCharClass::addCategories(int cats)
2354	{
2355	c \|= cats;
2356	#ifndef QT_NO_REGEXP_OPTIM
2357	occ1.fill(0, NumBadChars);
2358	#endif
2359	}
2360
2361	void QRegExpCharClass::addRange(ushort from, ushort to)
2362	{
2363	if (from > to)
2364	qSwap(from, to);
2365	int m = r.size();
2366	r.resize(m + 1);
2367	r[m].from = from;
2368	r[m].len = to - from + 1;
2369
2370	#ifndef QT_NO_REGEXP_OPTIM
2371	int i;
2372
2373	if (to - from < NumBadChars) {
2374	if (from % NumBadChars <= to % NumBadChars) {
2375	for (i = from % NumBadChars; i <= to % NumBadChars; i++)
2376	occ1[i] = 0;
2377	} else {
2378	for (i = 0; i <= to % NumBadChars; i++)
2379	occ1[i] = 0;
2380	for (i = from % NumBadChars; i < NumBadChars; i++)
2381	occ1[i] = 0;
2382	}
2383	} else {
2384	occ1.fill(0, NumBadChars);
2385	}
2386	#endif
2387	}
2388
2389	bool QRegExpCharClass::in(QChar ch) const
2390	{
2391	#ifndef QT_NO_REGEXP_OPTIM
2392	if (occ1.at(BadChar(ch)) == NoOccurrence)
2393	return n;
2394	#endif
2395
2396	if (c != 0 && (c & (1 << (int)ch.category())) != 0)
2397	return !n;
2398
2399	const int uc = ch.unicode();
2400	int size = r.size();
2401
2402	for (int i = 0; i < size; ++i) {
2403	const QRegExpCharClassRange &range = r.at(i);
2404	if (uint(uc - range.from) < uint(r.at(i).len))
2405	return !n;
2406	}
2407	return n;
2408	}
2409
2410	#if defined(QT_DEBUG)
2411	void QRegExpCharClass::dump() const
2412	{
2413	int i;
2414	qDebug(" %stive character class", n ? "nega" : "posi");
2415	#ifndef QT_NO_REGEXP_CCLASS
2416	if (c != 0)
2417	qDebug(" categories 0x%.8x", c);
2418	#endif
2419	for (i = 0; i < r.size(); i++)
2420	qDebug(" 0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);
2421	}
2422	#endif
2423	#endif
2424
2425	QRegExpEngine::Box::Box(QRegExpEngine *engine)
2426	: eng(engine), skipanchors(0)
2427	#ifndef QT_NO_REGEXP_OPTIM
2428	, earlyStart(0), lateStart(0), maxl(0)
2429	#endif
2430	{
2431	#ifndef QT_NO_REGEXP_OPTIM
2432	occ1.fill(NoOccurrence, NumBadChars);
2433	#endif
2434	minl = 0;
2435	}
2436
2437	QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
2438	{
2439	eng = b.eng;
2440	ls = b.ls;
2441	rs = b.rs;
2442	lanchors = b.lanchors;
2443	ranchors = b.ranchors;
2444	skipanchors = b.skipanchors;
2445	#ifndef QT_NO_REGEXP_OPTIM
2446	earlyStart = b.earlyStart;
2447	lateStart = b.lateStart;
2448	str = b.str;
2449	leftStr = b.leftStr;
2450	rightStr = b.rightStr;
2451	maxl = b.maxl;
2452	occ1 = b.occ1;
2453	#endif
2454	minl = b.minl;
2455	return *this;
2456	}
2457
2458	void QRegExpEngine::Box::set(QChar ch)
2459	{
2460	ls.resize(1);
2461	ls[0] = eng->createState(ch);
2462	rs = ls;
2463	#ifndef QT_NO_REGEXP_OPTIM
2464	str = ch;
2465	leftStr = ch;
2466	rightStr = ch;
2467	maxl = 1;
2468	occ1[BadChar(ch)] = 0;
2469	#endif
2470	minl = 1;
2471	}
2472
2473	void QRegExpEngine::Box::set(const QRegExpCharClass &cc)
2474	{
2475	ls.resize(1);
2476	ls[0] = eng->createState(cc);
2477	rs = ls;
2478	#ifndef QT_NO_REGEXP_OPTIM
2479	maxl = 1;
2480	occ1 = cc.firstOccurrence();
2481	#endif
2482	minl = 1;
2483	}
2484
2485	#ifndef QT_NO_REGEXP_BACKREF
2486	void QRegExpEngine::Box::set(int bref)
2487	{
2488	ls.resize(1);
2489	ls[0] = eng->createState(bref);
2490	rs = ls;
2491	if (bref >= 1 && bref <= MaxBackRefs)
2492	skipanchors = Anchor_BackRef0Empty << bref;
2493	#ifndef QT_NO_REGEXP_OPTIM
2494	maxl = InftyLen;
2495	#endif
2496	minl = 0;
2497	}
2498	#endif
2499
2500	void QRegExpEngine::Box::cat(const Box &b)
2501	{
2502	eng->addCatTransitions(rs, b.ls);
2503	addAnchorsToEngine(b);
2504	if (minl == 0) {
2505	lanchors.unite(b.lanchors);
2506	if (skipanchors != 0) {
2507	for (int i = 0; i < b.ls.size(); i++) {
2508	int a = eng->anchorConcatenation(lanchors.value(b.ls.at(i), 0), skipanchors);
2509	lanchors.insert(b.ls.at(i), a);
2510	}
2511	}
2512	mergeInto(&ls, b.ls);
2513	}
2514	if (b.minl == 0) {
2515	ranchors.unite(b.ranchors);
2516	if (b.skipanchors != 0) {
2517	for (int i = 0; i < rs.size(); i++) {
2518	int a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), b.skipanchors);
2519	ranchors.insert(rs.at(i), a);
2520	}
2521	}
2522	mergeInto(&rs, b.rs);
2523	} else {
2524	ranchors = b.ranchors;
2525	rs = b.rs;
2526	}
2527
2528	#ifndef QT_NO_REGEXP_OPTIM
2529	if (maxl != InftyLen) {
2530	if (rightStr.length() + b.leftStr.length() >
2531	qMax(str.length(), b.str.length())) {
2532	earlyStart = minl - rightStr.length();
2533	lateStart = maxl - rightStr.length();
2534	str = rightStr + b.leftStr;
2535	} else if (b.str.length() > str.length()) {
2536	earlyStart = minl + b.earlyStart;
2537	lateStart = maxl + b.lateStart;
2538	str = b.str;
2539	}
2540	}
2541
2542	if (leftStr.length() == maxl)
2543	leftStr += b.leftStr;
2544
2545	if (b.rightStr.length() == b.maxl) {
2546	rightStr += b.rightStr;
2547	} else {
2548	rightStr = b.rightStr;
2549	}
2550
2551	if (maxl == InftyLen \|\| b.maxl == InftyLen) {
2552	maxl = InftyLen;
2553	} else {
2554	maxl += b.maxl;
2555	}
2556
2557	for (int i = 0; i < NumBadChars; i++) {
2558	if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))
2559	occ1[i] = minl + b.occ1.at(i);
2560	}
2561	#endif
2562
2563	minl += b.minl;
2564	if (minl == 0)
2565	skipanchors = eng->anchorConcatenation(skipanchors, b.skipanchors);
2566	else
2567	skipanchors = 0;
2568	}
2569
2570	void QRegExpEngine::Box::orx(const Box &b)
2571	{
2572	mergeInto(&ls, b.ls);
2573	lanchors.unite(b.lanchors);
2574	mergeInto(&rs, b.rs);
2575	ranchors.unite(b.ranchors);
2576
2577	if (b.minl == 0) {
2578	if (minl == 0)
2579	skipanchors = eng->anchorAlternation(skipanchors, b.skipanchors);
2580	else
2581	skipanchors = b.skipanchors;
2582	}
2583
2584	#ifndef QT_NO_REGEXP_OPTIM
2585	for (int i = 0; i < NumBadChars; i++) {
2586	if (occ1.at(i) > b.occ1.at(i))
2587	occ1[i] = b.occ1.at(i);
2588	}
2589	earlyStart = 0;
2590	lateStart = 0;
2591	str = QString();
2592	leftStr = QString();
2593	rightStr = QString();
2594	if (b.maxl > maxl)
2595	maxl = b.maxl;
2596	#endif
2597	if (b.minl < minl)
2598	minl = b.minl;
2599	}
2600
2601	void QRegExpEngine::Box::plus(int atom)
2602	{
2603	#ifndef QT_NO_REGEXP_CAPTURE
2604	eng->addPlusTransitions(rs, ls, atom);
2605	#else
2606	Q_UNUSED(atom);
2607	eng->addCatTransitions(rs, ls);
2608	#endif
2609	addAnchorsToEngine(*this);
2610	#ifndef QT_NO_REGEXP_OPTIM
2611	maxl = InftyLen;
2612	#endif
2613	}
2614
2615	void QRegExpEngine::Box::opt()
2616	{
2617	#ifndef QT_NO_REGEXP_OPTIM
2618	earlyStart = 0;
2619	lateStart = 0;
2620	str = QString();
2621	leftStr = QString();
2622	rightStr = QString();
2623	#endif
2624	skipanchors = 0;
2625	minl = 0;
2626	}
2627
2628	void QRegExpEngine::Box::catAnchor(int a)
2629	{
2630	if (a != 0) {
2631	for (int i = 0; i < rs.size(); i++) {
2632	a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), a);
2633	ranchors.insert(rs.at(i), a);
2634	}
2635	if (minl == 0)
2636	skipanchors = eng->anchorConcatenation(skipanchors, a);
2637	}
2638	}
2639
2640	#ifndef QT_NO_REGEXP_OPTIM
2641	void QRegExpEngine::Box::setupHeuristics()
2642	{
2643	eng->goodEarlyStart = earlyStart;
2644	eng->goodLateStart = lateStart;
2645	eng->goodStr = eng->cs ? str : str.toLower();
2646
2647	eng->minl = minl;
2648	if (eng->cs) {
2649	/*
2650	A regular expression such as 112\|1 has occ1['2'] = 2 and minl =
2651	1 at this point. An entry of occ1 has to be at most minl or
2652	infinity for the rest of the algorithm to go well.
2653
2654	We waited until here before normalizing these cases (instead of
2655	doing it in Box::orx()) because sometimes things improve by
2656	themselves. Consider for example (112\|1)34.
2657	*/
2658	for (int i = 0; i < NumBadChars; i++) {
2659	if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)
2660	occ1[i] = minl;
2661	}
2662	eng->occ1 = occ1;
2663	} else {
2664	eng->occ1.fill(0, NumBadChars);
2665	}
2666
2667	eng->heuristicallyChooseHeuristic();
2668	}
2669	#endif
2670
2671	#if defined(QT_DEBUG)
2672	void QRegExpEngine::Box::dump() const
2673	{
2674	int i;
2675	qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s");
2676	qDebug(" Left states:");
2677	for (i = 0; i < ls.size(); i++) {
2678	if (lanchors.value(ls[i], 0) == 0)
2679	qDebug(" %d", ls[i]);
2680	else
2681	qDebug(" %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);
2682	}
2683	qDebug(" Right states:");
2684	for (i = 0; i < rs.size(); i++) {
2685	if (ranchors.value(rs[i], 0) == 0)
2686	qDebug(" %d", rs[i]);
2687	else
2688	qDebug(" %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);
2689	}
2690	qDebug(" Skip anchors: 0x%.8x", skipanchors);
2691	}
2692	#endif
2693
2694	void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const
2695	{
2696	for (int i = 0; i < to.ls.size(); i++) {
2697	for (int j = 0; j < rs.size(); j++) {
2698	int a = eng->anchorConcatenation(ranchors.value(rs.at(j), 0),
2699	to.lanchors.value(to.ls.at(i), 0));
2700	eng->addAnchors(rs[j], to.ls[i], a);
2701	}
2702	}
2703	}
2704
2705	void QRegExpEngine::setupCategoriesRangeMap()
2706	{
2707	categoriesRangeMap.insert("IsBasicLatin", qMakePair(0x0000, 0x007F));
2708	categoriesRangeMap.insert("IsLatin-1Supplement", qMakePair(0x0080, 0x00FF));
2709	categoriesRangeMap.insert("IsLatinExtended-A", qMakePair(0x0100, 0x017F));
2710	categoriesRangeMap.insert("IsLatinExtended-B", qMakePair(0x0180, 0x024F));
2711	categoriesRangeMap.insert("IsIPAExtensions", qMakePair(0x0250, 0x02AF));
2712	categoriesRangeMap.insert("IsSpacingModifierLetters", qMakePair(0x02B0, 0x02FF));
2713	categoriesRangeMap.insert("IsCombiningDiacriticalMarks", qMakePair(0x0300, 0x036F));
2714	categoriesRangeMap.insert("IsGreek", qMakePair(0x0370, 0x03FF));
2715	categoriesRangeMap.insert("IsCyrillic", qMakePair(0x0400, 0x04FF));
2716	categoriesRangeMap.insert("IsCyrillicSupplement", qMakePair(0x0500, 0x052F));
2717	categoriesRangeMap.insert("IsArmenian", qMakePair(0x0530, 0x058F));
2718	categoriesRangeMap.insert("IsHebrew", qMakePair(0x0590, 0x05FF));
2719	categoriesRangeMap.insert("IsArabic", qMakePair(0x0600, 0x06FF));
2720	categoriesRangeMap.insert("IsSyriac", qMakePair(0x0700, 0x074F));
2721	categoriesRangeMap.insert("IsArabicSupplement", qMakePair(0x0750, 0x077F));
2722	categoriesRangeMap.insert("IsThaana", qMakePair(0x0780, 0x07BF));
2723	categoriesRangeMap.insert("IsDevanagari", qMakePair(0x0900, 0x097F));
2724	categoriesRangeMap.insert("IsBengali", qMakePair(0x0980, 0x09FF));
2725	categoriesRangeMap.insert("IsGurmukhi", qMakePair(0x0A00, 0x0A7F));
2726	categoriesRangeMap.insert("IsGujarati", qMakePair(0x0A80, 0x0AFF));
2727	categoriesRangeMap.insert("IsOriya", qMakePair(0x0B00, 0x0B7F));
2728	categoriesRangeMap.insert("IsTamil", qMakePair(0x0B80, 0x0BFF));
2729	categoriesRangeMap.insert("IsTelugu", qMakePair(0x0C00, 0x0C7F));
2730	categoriesRangeMap.insert("IsKannada", qMakePair(0x0C80, 0x0CFF));
2731	categoriesRangeMap.insert("IsMalayalam", qMakePair(0x0D00, 0x0D7F));
2732	categoriesRangeMap.insert("IsSinhala", qMakePair(0x0D80, 0x0DFF));
2733	categoriesRangeMap.insert("IsThai", qMakePair(0x0E00, 0x0E7F));
2734	categoriesRangeMap.insert("IsLao", qMakePair(0x0E80, 0x0EFF));
2735	categoriesRangeMap.insert("IsTibetan", qMakePair(0x0F00, 0x0FFF));
2736	categoriesRangeMap.insert("IsMyanmar", qMakePair(0x1000, 0x109F));
2737	categoriesRangeMap.insert("IsGeorgian", qMakePair(0x10A0, 0x10FF));
2738	categoriesRangeMap.insert("IsHangulJamo", qMakePair(0x1100, 0x11FF));
2739	categoriesRangeMap.insert("IsEthiopic", qMakePair(0x1200, 0x137F));
2740	categoriesRangeMap.insert("IsEthiopicSupplement", qMakePair(0x1380, 0x139F));
2741	categoriesRangeMap.insert("IsCherokee", qMakePair(0x13A0, 0x13FF));
2742	categoriesRangeMap.insert("IsUnifiedCanadianAboriginalSyllabics", qMakePair(0x1400, 0x167F));
2743	categoriesRangeMap.insert("IsOgham", qMakePair(0x1680, 0x169F));
2744	categoriesRangeMap.insert("IsRunic", qMakePair(0x16A0, 0x16FF));
2745	categoriesRangeMap.insert("IsTagalog", qMakePair(0x1700, 0x171F));
2746	categoriesRangeMap.insert("IsHanunoo", qMakePair(0x1720, 0x173F));
2747	categoriesRangeMap.insert("IsBuhid", qMakePair(0x1740, 0x175F));
2748	categoriesRangeMap.insert("IsTagbanwa", qMakePair(0x1760, 0x177F));
2749	categoriesRangeMap.insert("IsKhmer", qMakePair(0x1780, 0x17FF));
2750	categoriesRangeMap.insert("IsMongolian", qMakePair(0x1800, 0x18AF));
2751	categoriesRangeMap.insert("IsLimbu", qMakePair(0x1900, 0x194F));
2752	categoriesRangeMap.insert("IsTaiLe", qMakePair(0x1950, 0x197F));
2753	categoriesRangeMap.insert("IsNewTaiLue", qMakePair(0x1980, 0x19DF));
2754	categoriesRangeMap.insert("IsKhmerSymbols", qMakePair(0x19E0, 0x19FF));
2755	categoriesRangeMap.insert("IsBuginese", qMakePair(0x1A00, 0x1A1F));
2756	categoriesRangeMap.insert("IsPhoneticExtensions", qMakePair(0x1D00, 0x1D7F));
2757	categoriesRangeMap.insert("IsPhoneticExtensionsSupplement", qMakePair(0x1D80, 0x1DBF));
2758	categoriesRangeMap.insert("IsCombiningDiacriticalMarksSupplement", qMakePair(0x1DC0, 0x1DFF));
2759	categoriesRangeMap.insert("IsLatinExtendedAdditional", qMakePair(0x1E00, 0x1EFF));
2760	categoriesRangeMap.insert("IsGreekExtended", qMakePair(0x1F00, 0x1FFF));
2761	categoriesRangeMap.insert("IsGeneralPunctuation", qMakePair(0x2000, 0x206F));
2762	categoriesRangeMap.insert("IsSuperscriptsandSubscripts", qMakePair(0x2070, 0x209F));
2763	categoriesRangeMap.insert("IsCurrencySymbols", qMakePair(0x20A0, 0x20CF));
2764	categoriesRangeMap.insert("IsCombiningMarksforSymbols", qMakePair(0x20D0, 0x20FF));
2765	categoriesRangeMap.insert("IsLetterlikeSymbols", qMakePair(0x2100, 0x214F));
2766	categoriesRangeMap.insert("IsNumberForms", qMakePair(0x2150, 0x218F));
2767	categoriesRangeMap.insert("IsArrows", qMakePair(0x2190, 0x21FF));
2768	categoriesRangeMap.insert("IsMathematicalOperators", qMakePair(0x2200, 0x22FF));
2769	categoriesRangeMap.insert("IsMiscellaneousTechnical", qMakePair(0x2300, 0x23FF));
2770	categoriesRangeMap.insert("IsControlPictures", qMakePair(0x2400, 0x243F));
2771	categoriesRangeMap.insert("IsOpticalCharacterRecognition", qMakePair(0x2440, 0x245F));
2772	categoriesRangeMap.insert("IsEnclosedAlphanumerics", qMakePair(0x2460, 0x24FF));
2773	categoriesRangeMap.insert("IsBoxDrawing", qMakePair(0x2500, 0x257F));
2774	categoriesRangeMap.insert("IsBlockElements", qMakePair(0x2580, 0x259F));
2775	categoriesRangeMap.insert("IsGeometricShapes", qMakePair(0x25A0, 0x25FF));
2776	categoriesRangeMap.insert("IsMiscellaneousSymbols", qMakePair(0x2600, 0x26FF));
2777	categoriesRangeMap.insert("IsDingbats", qMakePair(0x2700, 0x27BF));
2778	categoriesRangeMap.insert("IsMiscellaneousMathematicalSymbols-A", qMakePair(0x27C0, 0x27EF));
2779	categoriesRangeMap.insert("IsSupplementalArrows-A", qMakePair(0x27F0, 0x27FF));
2780	categoriesRangeMap.insert("IsBraillePatterns", qMakePair(0x2800, 0x28FF));
2781	categoriesRangeMap.insert("IsSupplementalArrows-B", qMakePair(0x2900, 0x297F));
2782	categoriesRangeMap.insert("IsMiscellaneousMathematicalSymbols-B", qMakePair(0x2980, 0x29FF));
2783	categoriesRangeMap.insert("IsSupplementalMathematicalOperators", qMakePair(0x2A00, 0x2AFF));
2784	categoriesRangeMap.insert("IsMiscellaneousSymbolsandArrows", qMakePair(0x2B00, 0x2BFF));
2785	categoriesRangeMap.insert("IsGlagolitic", qMakePair(0x2C00, 0x2C5F));
2786	categoriesRangeMap.insert("IsCoptic", qMakePair(0x2C80, 0x2CFF));
2787	categoriesRangeMap.insert("IsGeorgianSupplement", qMakePair(0x2D00, 0x2D2F));
2788	categoriesRangeMap.insert("IsTifinagh", qMakePair(0x2D30, 0x2D7F));
2789	categoriesRangeMap.insert("IsEthiopicExtended", qMakePair(0x2D80, 0x2DDF));
2790	categoriesRangeMap.insert("IsSupplementalPunctuation", qMakePair(0x2E00, 0x2E7F));
2791	categoriesRangeMap.insert("IsCJKRadicalsSupplement", qMakePair(0x2E80, 0x2EFF));
2792	categoriesRangeMap.insert("IsKangxiRadicals", qMakePair(0x2F00, 0x2FDF));
2793	categoriesRangeMap.insert("IsIdeographicDescriptionCharacters", qMakePair(0x2FF0, 0x2FFF));
2794	categoriesRangeMap.insert("IsCJKSymbolsandPunctuation", qMakePair(0x3000, 0x303F));
2795	categoriesRangeMap.insert("IsHiragana", qMakePair(0x3040, 0x309F));
2796	categoriesRangeMap.insert("IsKatakana", qMakePair(0x30A0, 0x30FF));
2797	categoriesRangeMap.insert("IsBopomofo", qMakePair(0x3100, 0x312F));
2798	categoriesRangeMap.insert("IsHangulCompatibilityJamo", qMakePair(0x3130, 0x318F));
2799	categoriesRangeMap.insert("IsKanbun", qMakePair(0x3190, 0x319F));
2800	categoriesRangeMap.insert("IsBopomofoExtended", qMakePair(0x31A0, 0x31BF));
2801	categoriesRangeMap.insert("IsCJKStrokes", qMakePair(0x31C0, 0x31EF));
2802	categoriesRangeMap.insert("IsKatakanaPhoneticExtensions", qMakePair(0x31F0, 0x31FF));
2803	categoriesRangeMap.insert("IsEnclosedCJKLettersandMonths", qMakePair(0x3200, 0x32FF));
2804	categoriesRangeMap.insert("IsCJKCompatibility", qMakePair(0x3300, 0x33FF));
2805	categoriesRangeMap.insert("IsCJKUnifiedIdeographsExtensionA", qMakePair(0x3400, 0x4DB5));
2806	categoriesRangeMap.insert("IsYijingHexagramSymbols", qMakePair(0x4DC0, 0x4DFF));
2807	categoriesRangeMap.insert("IsCJKUnifiedIdeographs", qMakePair(0x4E00, 0x9FFF));
2808	categoriesRangeMap.insert("IsYiSyllables", qMakePair(0xA000, 0xA48F));
2809	categoriesRangeMap.insert("IsYiRadicals", qMakePair(0xA490, 0xA4CF));
2810	categoriesRangeMap.insert("IsModifierToneLetters", qMakePair(0xA700, 0xA71F));
2811	categoriesRangeMap.insert("IsSylotiNagri", qMakePair(0xA800, 0xA82F));
2812	categoriesRangeMap.insert("IsHangulSyllables", qMakePair(0xAC00, 0xD7A3));
2813	categoriesRangeMap.insert("IsPrivateUse", qMakePair(0xE000, 0xF8FF));
2814	categoriesRangeMap.insert("IsCJKCompatibilityIdeographs", qMakePair(0xF900, 0xFAFF));
2815	categoriesRangeMap.insert("IsAlphabeticPresentationForms", qMakePair(0xFB00, 0xFB4F));
2816	categoriesRangeMap.insert("IsArabicPresentationForms-A", qMakePair(0xFB50, 0xFDFF));
2817	categoriesRangeMap.insert("IsVariationSelectors", qMakePair(0xFE00, 0xFE0F));
2818	categoriesRangeMap.insert("IsVerticalForms", qMakePair(0xFE10, 0xFE1F));
2819	categoriesRangeMap.insert("IsCombiningHalfMarks", qMakePair(0xFE20, 0xFE2F));
2820	categoriesRangeMap.insert("IsCJKCompatibilityForms", qMakePair(0xFE30, 0xFE4F));
2821	categoriesRangeMap.insert("IsSmallFormVariants", qMakePair(0xFE50, 0xFE6F));
2822	categoriesRangeMap.insert("IsArabicPresentationForms-B", qMakePair(0xFE70, 0xFEFF));
2823	categoriesRangeMap.insert("IsHalfwidthandFullwidthForms", qMakePair(0xFF00, 0xFFEF));
2824	categoriesRangeMap.insert("IsSpecials", qMakePair(0xFFF0, 0xFFFF));
2825	categoriesRangeMap.insert("IsLinearBSyllabary", qMakePair(0x10000, 0x1007F));
2826	categoriesRangeMap.insert("IsLinearBIdeograms", qMakePair(0x10080, 0x100FF));
2827	categoriesRangeMap.insert("IsAegeanNumbers", qMakePair(0x10100, 0x1013F));
2828	categoriesRangeMap.insert("IsAncientGreekNumbers", qMakePair(0x10140, 0x1018F));
2829	categoriesRangeMap.insert("IsOldItalic", qMakePair(0x10300, 0x1032F));
2830	categoriesRangeMap.insert("IsGothic", qMakePair(0x10330, 0x1034F));
2831	categoriesRangeMap.insert("IsUgaritic", qMakePair(0x10380, 0x1039F));
2832	categoriesRangeMap.insert("IsOldPersian", qMakePair(0x103A0, 0x103DF));
2833	categoriesRangeMap.insert("IsDeseret", qMakePair(0x10400, 0x1044F));
2834	categoriesRangeMap.insert("IsShavian", qMakePair(0x10450, 0x1047F));
2835	categoriesRangeMap.insert("IsOsmanya", qMakePair(0x10480, 0x104AF));
2836	categoriesRangeMap.insert("IsCypriotSyllabary", qMakePair(0x10800, 0x1083F));
2837	categoriesRangeMap.insert("IsKharoshthi", qMakePair(0x10A00, 0x10A5F));
2838	categoriesRangeMap.insert("IsByzantineMusicalSymbols", qMakePair(0x1D000, 0x1D0FF));
2839	categoriesRangeMap.insert("IsMusicalSymbols", qMakePair(0x1D100, 0x1D1FF));
2840	categoriesRangeMap.insert("IsAncientGreekMusicalNotation", qMakePair(0x1D200, 0x1D24F));
2841	categoriesRangeMap.insert("IsTaiXuanJingSymbols", qMakePair(0x1D300, 0x1D35F));
2842	categoriesRangeMap.insert("IsMathematicalAlphanumericSymbols", qMakePair(0x1D400, 0x1D7FF));
2843	categoriesRangeMap.insert("IsCJKUnifiedIdeographsExtensionB", qMakePair(0x20000, 0x2A6DF));
2844	categoriesRangeMap.insert("IsCJKCompatibilityIdeographsSupplement", qMakePair(0x2F800, 0x2FA1F));
2845	categoriesRangeMap.insert("IsTags", qMakePair(0xE0000, 0xE007F));
2846	categoriesRangeMap.insert("IsVariationSelectorsSupplement", qMakePair(0xE0100, 0xE01EF));
2847	categoriesRangeMap.insert("IsSupplementaryPrivateUseArea-A", qMakePair(0xF0000, 0xFFFFF));
2848	categoriesRangeMap.insert("IsSupplementaryPrivateUseArea-B", qMakePair(0x100000, 0x10FFFF));
2849	}
2850
2851	int QRegExpEngine::getChar()
2852	{
2853	return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();
2854	}
2855
2856	int QRegExpEngine::getEscape()
2857	{
2858	#ifndef QT_NO_REGEXP_ESCAPE
2859	const char tab[] = "afnrtv"; // no b, as \b means word boundary
2860	const char backTab[] = "\a\f\n\r\t\v";
2861	ushort low;
2862	int i;
2863	#endif
2864	ushort val;
2865	int prevCh = yyCh;
2866
2867	if (prevCh == EOS) {
2868	error(RXERR_END);
2869	return Tok_Char \| '\\';
2870	}
2871	yyCh = getChar();
2872	#ifndef QT_NO_REGEXP_ESCAPE
2873	if ((prevCh & ~0xff) == 0) {
2874	const char *p = strchr(tab, prevCh);
2875	if (p != 0)
2876	return Tok_Char \| backTab[p - tab];
2877	}
2878	#endif
2879
2880	switch (prevCh) {
2881	#ifndef QT_NO_REGEXP_ESCAPE
2882	case '0':
2883	val = 0;
2884	for (i = 0; i < 3; i++) {
2885	if (yyCh >= '0' && yyCh <= '7')
2886	val = (val << 3) \| (yyCh - '0');
2887	else
2888	break;
2889	yyCh = getChar();
2890	}
2891	if ((val & ~0377) != 0)
2892	error(RXERR_OCTAL);
2893	return Tok_Char \| val;
2894	#endif
2895	#ifndef QT_NO_REGEXP_ESCAPE
2896	case 'B':
2897	return Tok_NonWord;
2898	#endif
2899	#ifndef QT_NO_REGEXP_CCLASS
2900	case 'D':
2901	// see QChar::isDigit()
2902	yyCharClass->addCategories(0x7fffffef);
2903	return Tok_CharClass;
2904	case 'S':
2905	// see QChar::isSpace()
2906	yyCharClass->addCategories(0x7ffff87f);
2907	yyCharClass->addRange(0x0000, 0x0008);
2908	yyCharClass->addRange(0x000e, 0x001f);
2909	yyCharClass->addRange(0x007f, 0x009f);
2910	return Tok_CharClass;
2911	case 'W':
2912	// see QChar::isLetterOrNumber() and QChar::isMark()
2913	yyCharClass->addCategories(0x7fe07f81);
2914	yyCharClass->addRange(0x203f, 0x2040);
2915	yyCharClass->addSingleton(0x2040);
2916	yyCharClass->addSingleton(0x2054);
2917	yyCharClass->addSingleton(0x30fb);
2918	yyCharClass->addRange(0xfe33, 0xfe34);
2919	yyCharClass->addRange(0xfe4d, 0xfe4f);
2920	yyCharClass->addSingleton(0xff3f);
2921	yyCharClass->addSingleton(0xff65);
2922	return Tok_CharClass;
2923	#endif
2924	#ifndef QT_NO_REGEXP_ESCAPE
2925	case 'b':
2926	return Tok_Word;
2927	#endif
2928	#ifndef QT_NO_REGEXP_CCLASS
2929	case 'd':
2930	// see QChar::isDigit()
2931	yyCharClass->addCategories(0x00000010);
2932	return Tok_CharClass;
2933	case 's':
2934	// see QChar::isSpace()
2935	yyCharClass->addCategories(0x00000380);
2936	yyCharClass->addRange(0x0009, 0x000d);
2937	return Tok_CharClass;
2938	case 'w':
2939	// see QChar::isLetterOrNumber() and QChar::isMark()
2940	yyCharClass->addCategories(0x000f807e);
2941	yyCharClass->addSingleton(0x005f); // '_'
2942	return Tok_CharClass;
2943	case 'I':
2944	if (xmlSchemaExtensions) {
2945	yyCharClass->setNegative(!yyCharClass->negative());
2946	// fall through
2947	}
2948	case 'i':
2949	if (xmlSchemaExtensions) {
2950	yyCharClass->addCategories(0x000f807e);
2951	yyCharClass->addSingleton(0x003a); // ':'
2952	yyCharClass->addSingleton(0x005f); // '_'
2953	yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
2954	yyCharClass->addRange(0x0061, 0x007a); // [a-z]
2955	yyCharClass->addRange(0xc0, 0xd6);
2956	yyCharClass->addRange(0xd8, 0xf6);
2957	yyCharClass->addRange(0xf8, 0x2ff);
2958	yyCharClass->addRange(0x370, 0x37d);
2959	yyCharClass->addRange(0x37f, 0x1fff);
2960	yyCharClass->addRange(0x200c, 0x200d);
2961	yyCharClass->addRange(0x2070, 0x218f);
2962	yyCharClass->addRange(0x2c00, 0x2fef);
2963	yyCharClass->addRange(0x3001, 0xd7ff);
2964	yyCharClass->addRange(0xf900, 0xfdcf);
2965	yyCharClass->addRange(0xfdf0, 0xfffd);
2966	yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
2967	}
2968	return Tok_CharClass;
2969	case 'C':
2970	if (xmlSchemaExtensions) {
2971	yyCharClass->setNegative(!yyCharClass->negative());
2972	// fall through
2973	}
2974	case 'c':
2975	if (xmlSchemaExtensions) {
2976	yyCharClass->addCategories(0x000f807e);
2977	yyCharClass->addSingleton(0x002d); // '-'
2978	yyCharClass->addSingleton(0x002e); // '.'
2979	yyCharClass->addSingleton(0x003a); // ':'
2980	yyCharClass->addSingleton(0x005f); // '_'
2981	yyCharClass->addSingleton(0xb7);
2982	yyCharClass->addRange(0x0030, 0x0039); // [0-9]
2983	yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
2984	yyCharClass->addRange(0x0061, 0x007a); // [a-z]
2985	yyCharClass->addRange(0xc0, 0xd6);
2986	yyCharClass->addRange(0xd8, 0xf6);
2987	yyCharClass->addRange(0xf8, 0x2ff);
2988	yyCharClass->addRange(0x370, 0x37d);
2989	yyCharClass->addRange(0x37f, 0x1fff);
2990	yyCharClass->addRange(0x200c, 0x200d);
2991	yyCharClass->addRange(0x2070, 0x218f);
2992	yyCharClass->addRange(0x2c00, 0x2fef);
2993	yyCharClass->addRange(0x3001, 0xd7ff);
2994	yyCharClass->addRange(0xf900, 0xfdcf);
2995	yyCharClass->addRange(0xfdf0, 0xfffd);
2996	yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
2997	yyCharClass->addRange(0x0300, 0x036f);
2998	yyCharClass->addRange(0x203f, 0x2040);
2999	}
3000	return Tok_CharClass;
3001	case 'P':
3002	if (xmlSchemaExtensions) {
3003	yyCharClass->setNegative(!yyCharClass->negative());
3004	// fall through
3005	}
3006	case 'p':
3007	if (xmlSchemaExtensions) {
3008	if (yyCh != '{') {
3009	error(RXERR_CHARCLASS);
3010	return Tok_CharClass;
3011	}
3012
3013	QByteArray category;
3014	yyCh = getChar();
3015	while (yyCh != '}') {
3016	if (yyCh == EOS) {
3017	error(RXERR_END);
3018	return Tok_CharClass;
3019	}
3020	category.append(yyCh);
3021	yyCh = getChar();
3022	}
3023	yyCh = getChar(); // skip closing '}'
3024
3025	if (category == "M") {
3026	yyCharClass->addCategories(0x0000000e);
3027	} else if (category == "Mn") {
3028	yyCharClass->addCategories(0x00000002);
3029	} else if (category == "Mc") {
3030	yyCharClass->addCategories(0x00000004);
3031	} else if (category == "Me") {
3032	yyCharClass->addCategories(0x00000008);
3033	} else if (category == "N") {
3034	yyCharClass->addCategories(0x00000070);
3035	} else if (category == "Nd") {
3036	yyCharClass->addCategories(0x00000010);
3037	} else if (category == "Nl") {
3038	yyCharClass->addCategories(0x00000020);
3039	} else if (category == "No") {
3040	yyCharClass->addCategories(0x00000040);
3041	} else if (category == "Z") {
3042	yyCharClass->addCategories(0x00000380);
3043	} else if (category == "Zs") {
3044	yyCharClass->addCategories(0x00000080);
3045	} else if (category == "Zl") {
3046	yyCharClass->addCategories(0x00000100);
3047	} else if (category == "Zp") {
3048	yyCharClass->addCategories(0x00000200);
3049	} else if (category == "C") {
3050	yyCharClass->addCategories(0x00006c00);
3051	} else if (category == "Cc") {
3052	yyCharClass->addCategories(0x00000400);
3053	} else if (category == "Cf") {
3054	yyCharClass->addCategories(0x00000800);
3055	} else if (category == "Cs") {
3056	yyCharClass->addCategories(0x00001000);
3057	} else if (category == "Co") {
3058	yyCharClass->addCategories(0x00002000);
3059	} else if (category == "Cn") {
3060	yyCharClass->addCategories(0x00004000);
3061	} else if (category == "L") {
3062	yyCharClass->addCategories(0x000f8000);
3063	} else if (category == "Lu") {
3064	yyCharClass->addCategories(0x00008000);
3065	} else if (category == "Ll") {
3066	yyCharClass->addCategories(0x00010000);
3067	} else if (category == "Lt") {
3068	yyCharClass->addCategories(0x00020000);
3069	} else if (category == "Lm") {
3070	yyCharClass->addCategories(0x00040000);
3071	} else if (category == "Lo") {
3072	yyCharClass->addCategories(0x00080000);
3073	} else if (category == "P") {
3074	yyCharClass->addCategories(0x4f580780);
3075	} else if (category == "Pc") {
3076	yyCharClass->addCategories(0x00100000);
3077	} else if (category == "Pd") {
3078	yyCharClass->addCategories(0x00200000);
3079	} else if (category == "Ps") {
3080	yyCharClass->addCategories(0x00400000);
3081	} else if (category == "Pe") {
3082	yyCharClass->addCategories(0x00800000);
3083	} else if (category == "Pi") {
3084	yyCharClass->addCategories(0x01000000);
3085	} else if (category == "Pf") {
3086	yyCharClass->addCategories(0x02000000);
3087	} else if (category == "Po") {
3088	yyCharClass->addCategories(0x04000000);
3089	} else if (category == "S") {
3090	yyCharClass->addCategories(0x78000000);
3091	} else if (category == "Sm") {
3092	yyCharClass->addCategories(0x08000000);
3093	} else if (category == "Sc") {
3094	yyCharClass->addCategories(0x10000000);
3095	} else if (category == "Sk") {
3096	yyCharClass->addCategories(0x20000000);
3097	} else if (category == "So") {
3098	yyCharClass->addCategories(0x40000000);
3099	} else if (category.startsWith("Is")) {
3100	if (categoriesRangeMap.isEmpty())
3101	setupCategoriesRangeMap();
3102
3103	if (categoriesRangeMap.contains(category)) {
3104	const QPair<int, int> range = categoriesRangeMap.value(category);
3105	yyCharClass->addRange(range.first, range.second);
3106	} else {
3107	error(RXERR_CATEGORY);
3108	}
3109	} else {
3110	error(RXERR_CATEGORY);
3111	}
3112	}
3113	return Tok_CharClass;
3114	#endif
3115	#ifndef QT_NO_REGEXP_ESCAPE
3116	case 'x':
3117	val = 0;
3118	for (i = 0; i < 4; i++) {
3119	low = QChar(yyCh).toLower().unicode();
3120	if (low >= '0' && low <= '9')
3121	val = (val << 4) \| (low - '0');
3122	else if (low >= 'a' && low <= 'f')
3123	val = (val << 4) \| (low - 'a' + 10);
3124	else
3125	break;
3126	yyCh = getChar();
3127	}
3128	return Tok_Char \| val;
3129	#endif
3130	default:
3131	if (prevCh >= '1' && prevCh <= '9') {
3132	#ifndef QT_NO_REGEXP_BACKREF
3133	val = prevCh - '0';
3134	while (yyCh >= '0' && yyCh <= '9') {
3135	val = (val * 10) + (yyCh - '0');
3136	yyCh = getChar();
3137	}
3138	return Tok_BackRef \| val;
3139	#else
3140	error(RXERR_DISABLED);
3141	#endif
3142	}
3143	return Tok_Char \| prevCh;
3144	}
3145	}
3146
3147	#ifndef QT_NO_REGEXP_INTERVAL
3148	int QRegExpEngine::getRep(int def)
3149	{
3150	if (yyCh >= '0' && yyCh <= '9') {
3151	int rep = 0;
3152	do {
3153	rep = 10 * rep + yyCh - '0';
3154	if (rep >= InftyRep) {
3155	error(RXERR_REPETITION);
3156	rep = def;
3157	}
3158	yyCh = getChar();
3159	} while (yyCh >= '0' && yyCh <= '9');
3160	return rep;
3161	} else {
3162	return def;
3163	}
3164	}
3165	#endif
3166
3167	#ifndef QT_NO_REGEXP_LOOKAHEAD
3168	void QRegExpEngine::skipChars(int n)
3169	{
3170	if (n > 0) {
3171	yyPos += n - 1;
3172	yyCh = getChar();
3173	}
3174	}
3175	#endif
3176
3177	void QRegExpEngine::error(const char *msg)
3178	{
3179	if (yyError.isEmpty())
3180	yyError = QLatin1String(msg);
3181	}
3182
3183	void QRegExpEngine::startTokenizer(const QChar *rx, int len)
3184	{
3185	yyIn = rx;
3186	yyPos0 = 0;
3187	yyPos = 0;
3188	yyLen = len;
3189	yyCh = getChar();
3190	yyCharClass.reset(new QRegExpCharClass);
3191	yyMinRep = 0;
3192	yyMaxRep = 0;
3193	yyError = QString();
3194	}
3195
3196	int QRegExpEngine::getToken()
3197	{
3198	#ifndef QT_NO_REGEXP_CCLASS
3199	ushort pendingCh = 0;
3200	bool charPending;
3201	bool rangePending;
3202	int tok;
3203	#endif
3204	int prevCh = yyCh;
3205
3206	yyPos0 = yyPos - 1;
3207	#ifndef QT_NO_REGEXP_CCLASS
3208	yyCharClass->clear();
3209	#endif
3210	yyMinRep = 0;
3211	yyMaxRep = 0;
3212	yyCh = getChar();
3213
3214	switch (prevCh) {
3215	case EOS:
3216	yyPos0 = yyPos;
3217	return Tok_Eos;
3218	case '$':
3219	return Tok_Dollar;
3220	case '(':
3221	if (yyCh == '?') {
3222	prevCh = getChar();
3223	yyCh = getChar();
3224	switch (prevCh) {
3225	#ifndef QT_NO_REGEXP_LOOKAHEAD
3226	case '!':
3227	return Tok_NegLookahead;
3228	case '=':
3229	return Tok_PosLookahead;
3230	#endif
3231	case ':':
3232	return Tok_MagicLeftParen;
3233	default:
3234	error(RXERR_LOOKAHEAD);
3235	return Tok_MagicLeftParen;
3236	}
3237	} else {
3238	return Tok_LeftParen;
3239	}
3240	case ')':
3241	return Tok_RightParen;
3242	case '*':
3243	yyMinRep = 0;
3244	yyMaxRep = InftyRep;
3245	return Tok_Quantifier;
3246	case '+':
3247	yyMinRep = 1;
3248	yyMaxRep = InftyRep;
3249	return Tok_Quantifier;
3250	case '.':
3251	#ifndef QT_NO_REGEXP_CCLASS
3252	yyCharClass->setNegative(true);
3253	#endif
3254	return Tok_CharClass;
3255	case '?':
3256	yyMinRep = 0;
3257	yyMaxRep = 1;
3258	return Tok_Quantifier;
3259	case '[':
3260	#ifndef QT_NO_REGEXP_CCLASS
3261	if (yyCh == '^') {
3262	yyCharClass->setNegative(true);
3263	yyCh = getChar();
3264	}
3265	charPending = false;
3266	rangePending = false;
3267	do {
3268	if (yyCh == '-' && charPending && !rangePending) {
3269	rangePending = true;
3270	yyCh = getChar();
3271	} else {
3272	if (charPending && !rangePending) {
3273	yyCharClass->addSingleton(pendingCh);
3274	charPending = false;
3275	}
3276	if (yyCh == '\\') {
3277	yyCh = getChar();
3278	tok = getEscape();
3279	if (tok == Tok_Word)
3280	tok = '\b';
3281	} else {
3282	tok = Tok_Char \| yyCh;
3283	yyCh = getChar();
3284	}
3285	if (tok == Tok_CharClass) {
3286	if (rangePending) {
3287	yyCharClass->addSingleton('-');
3288	yyCharClass->addSingleton(pendingCh);
3289	charPending = false;
3290	rangePending = false;
3291	}
3292	} else if ((tok & Tok_Char) != 0) {
3293	if (rangePending) {
3294	yyCharClass->addRange(pendingCh, tok ^ Tok_Char);
3295	charPending = false;
3296	rangePending = false;
3297	} else {
3298	pendingCh = tok ^ Tok_Char;
3299	charPending = true;
3300	}
3301	} else {
3302	error(RXERR_CHARCLASS);
3303	}
3304	}
3305	} while (yyCh != ']' && yyCh != EOS);
3306	if (rangePending)
3307	yyCharClass->addSingleton('-');
3308	if (charPending)
3309	yyCharClass->addSingleton(pendingCh);
3310	if (yyCh == EOS)
3311	error(RXERR_END);
3312	else
3313	yyCh = getChar();
3314	return Tok_CharClass;
3315	#else
3316	error(RXERR_END);
3317	return Tok_Char \| '[';
3318	#endif
3319	case '\\':
3320	return getEscape();
3321	case ']':
3322	error(RXERR_LEFTDELIM);
3323	return Tok_Char \| ']';
3324	case '^':
3325	return Tok_Caret;
3326	case '{':
3327	#ifndef QT_NO_REGEXP_INTERVAL
3328	yyMinRep = getRep(0);
3329	yyMaxRep = yyMinRep;
3330	if (yyCh == ',') {
3331	yyCh = getChar();
3332	yyMaxRep = getRep(InftyRep);
3333	}
3334	if (yyMaxRep < yyMinRep)
3335	error(RXERR_INTERVAL);
3336	if (yyCh != '}')
3337	error(RXERR_REPETITION);
3338	yyCh = getChar();
3339	return Tok_Quantifier;
3340	#else
3341	error(RXERR_DISABLED);
3342	return Tok_Char \| '{';
3343	#endif
3344	case '\|':
3345	return Tok_Bar;
3346	case '}':
3347	error(RXERR_LEFTDELIM);
3348	return Tok_Char \| '}';
3349	default:
3350	return Tok_Char \| prevCh;
3351	}
3352	}
3353
3354	int QRegExpEngine::parse(const QChar *pattern, int len)
3355	{
3356	valid = true;
3357	startTokenizer(pattern, len);
3358	yyTok = getToken();
3359	#ifndef QT_NO_REGEXP_CAPTURE
3360	yyMayCapture = true;
3361	#else
3362	yyMayCapture = false;
3363	#endif
3364
3365	#ifndef QT_NO_REGEXP_CAPTURE
3366	int atom = startAtom(false);
3367	#endif
3368	QRegExpCharClass anything;
3369	Box box(this); // create InitialState
3370	box.set(anything);
3371	Box rightBox(this); // create FinalState
3372	rightBox.set(anything);
3373
3374	Box middleBox(this);
3375	parseExpression(&middleBox);
3376	#ifndef QT_NO_REGEXP_CAPTURE
3377	finishAtom(atom, false);
3378	#endif
3379	#ifndef QT_NO_REGEXP_OPTIM
3380	middleBox.setupHeuristics();
3381	#endif
3382	box.cat(middleBox);
3383	box.cat(rightBox);
3384	yyCharClass.reset(0);
3385
3386	#ifndef QT_NO_REGEXP_CAPTURE
3387	for (int i = 0; i < nf; ++i) {
3388	switch (f[i].capture) {
3389	case QRegExpAtom::NoCapture:
3390	break;
3391	case QRegExpAtom::OfficialCapture:
3392	f[i].capture = ncap;
3393	captureForOfficialCapture.append(ncap);
3394	++ncap;
3395	++officialncap;
3396	break;
3397	case QRegExpAtom::UnofficialCapture:
3398	f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;
3399	}
3400	}
3401
3402	#ifndef QT_NO_REGEXP_BACKREF
3403	#ifndef QT_NO_REGEXP_OPTIM
3404	if (officialncap == 0 && nbrefs == 0) {
3405	ncap = nf = 0;
3406	f.clear();
3407	}
3408	#endif
3409	// handle the case where there's a \5 with no corresponding capture
3410	// (captureForOfficialCapture.size() != officialncap)
3411	for (int i = 0; i < nbrefs - officialncap; ++i) {
3412	captureForOfficialCapture.append(ncap);
3413	++ncap;
3414	}
3415	#endif
3416	#endif
3417
3418	if (!yyError.isEmpty())
3419	return -1;
3420
3421	#ifndef QT_NO_REGEXP_OPTIM
3422	const QRegExpAutomatonState &sinit = s.at(InitialState);
3423	caretAnchored = !sinit.anchors.isEmpty();
3424	if (caretAnchored) {
3425	const QMap<int, int> &anchors = sinit.anchors;
3426	QMap<int, int>::const_iterator a;
3427	for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {
3428	if (
3429	#ifndef QT_NO_REGEXP_ANCHOR_ALT
3430	(*a & Anchor_Alternation) != 0 \|\|
3431	#endif
3432	(*a & Anchor_Caret) == 0)
3433	{
3434	caretAnchored = false;
3435	break;
3436	}
3437	}
3438	}
3439	#endif
3440
3441	// cleanup anchors
3442	int numStates = s.count();
3443	for (int i = 0; i < numStates; ++i) {
3444	QRegExpAutomatonState &state = s[i];
3445	if (!state.anchors.isEmpty()) {
3446	QMap<int, int>::iterator a = state.anchors.begin();
3447	while (a != state.anchors.end()) {
3448	if (a.value() == 0)
3449	a = state.anchors.erase(a);
3450	else
3451	++a;
3452	}
3453	}
3454	}
3455
3456	return yyPos0;
3457	}
3458
3459	void QRegExpEngine::parseAtom(Box *box)
3460	{
3461	#ifndef QT_NO_REGEXP_LOOKAHEAD
3462	QRegExpEngine *eng = 0;
3463	bool neg;
3464	int len;
3465	#endif
3466
3467	if ((yyTok & Tok_Char) != 0) {
3468	box->set(QChar(yyTok ^ Tok_Char));
3469	} else {
3470	#ifndef QT_NO_REGEXP_OPTIM
3471	trivial = false;
3472	#endif
3473	switch (yyTok) {
3474	case Tok_Dollar:
3475	box->catAnchor(Anchor_Dollar);
3476	break;
3477	case Tok_Caret:
3478	box->catAnchor(Anchor_Caret);
3479	break;
3480	#ifndef QT_NO_REGEXP_LOOKAHEAD
3481	case Tok_PosLookahead:
3482	case Tok_NegLookahead:
3483	neg = (yyTok == Tok_NegLookahead);
3484	eng = new QRegExpEngine(cs, greedyQuantifiers);
3485	len = eng->parse(yyIn + yyPos - 1, yyLen - yyPos + 1);
3486	if (len >= 0)
3487	skipChars(len);
3488	else
3489	error(RXERR_LOOKAHEAD);
3490	box->catAnchor(addLookahead(eng, neg));
3491	yyTok = getToken();
3492	if (yyTok != Tok_RightParen)
3493	error(RXERR_LOOKAHEAD);
3494	break;
3495	#endif
3496	#ifndef QT_NO_REGEXP_ESCAPE
3497	case Tok_Word:
3498	box->catAnchor(Anchor_Word);
3499	break;
3500	case Tok_NonWord:
3501	box->catAnchor(Anchor_NonWord);
3502	break;
3503	#endif
3504	case Tok_LeftParen:
3505	case Tok_MagicLeftParen:
3506	yyTok = getToken();
3507	parseExpression(box);
3508	if (yyTok != Tok_RightParen)
3509	error(RXERR_END);
3510	break;
3511	case Tok_CharClass:
3512	box->set(*yyCharClass);
3513	break;
3514	case Tok_Quantifier:
3515	error(RXERR_REPETITION);
3516	break;
3517	default:
3518	#ifndef QT_NO_REGEXP_BACKREF
3519	if ((yyTok & Tok_BackRef) != 0)
3520	box->set(yyTok ^ Tok_BackRef);
3521	else
3522	#endif
3523	error(RXERR_DISABLED);
3524	}
3525	}
3526	yyTok = getToken();
3527	}
3528
3529	void QRegExpEngine::parseFactor(Box *box)
3530	{
3531	#ifndef QT_NO_REGEXP_CAPTURE
3532	int outerAtom = greedyQuantifiers ? startAtom(false) : -1;
3533	int innerAtom = startAtom(yyMayCapture && yyTok == Tok_LeftParen);
3534	bool magicLeftParen = (yyTok == Tok_MagicLeftParen);
3535	#else
3536	const int innerAtom = -1;
3537	#endif
3538
3539	#ifndef QT_NO_REGEXP_INTERVAL
3540	#define YYREDO() \
3541	yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3542	*yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3543
3544	const QChar *in = yyIn;
3545	int pos0 = yyPos0;
3546	int pos = yyPos;
3547	int len = yyLen;
3548	int ch = yyCh;
3549	QRegExpCharClass charClass;
3550	if (yyTok == Tok_CharClass)
3551	charClass = *yyCharClass;
3552	int tok = yyTok;
3553	bool mayCapture = yyMayCapture;
3554	#endif
3555
3556	parseAtom(box);
3557	#ifndef QT_NO_REGEXP_CAPTURE
3558	finishAtom(innerAtom, magicLeftParen);
3559	#endif
3560
3561	bool hasQuantifier = (yyTok == Tok_Quantifier);
3562	if (hasQuantifier) {
3563	#ifndef QT_NO_REGEXP_OPTIM
3564	trivial = false;
3565	#endif
3566	if (yyMaxRep == InftyRep) {
3567	box->plus(innerAtom);
3568	#ifndef QT_NO_REGEXP_INTERVAL
3569	} else if (yyMaxRep == 0) {
3570	box->clear();
3571	#endif
3572	}
3573	if (yyMinRep == 0)
3574	box->opt();
3575
3576	#ifndef QT_NO_REGEXP_INTERVAL
3577	yyMayCapture = false;
3578	int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1;
3579	int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1);
3580
3581	Box rightBox(this);
3582	int i;
3583
3584	for (i = 0; i < beta; i++) {
3585	YYREDO();
3586	Box leftBox(this);
3587	parseAtom(&leftBox);
3588	leftBox.cat(rightBox);
3589	leftBox.opt();
3590	rightBox = leftBox;
3591	}
3592	for (i = 0; i < alpha; i++) {
3593	YYREDO();
3594	Box leftBox(this);
3595	parseAtom(&leftBox);
3596	leftBox.cat(rightBox);
3597	rightBox = leftBox;
3598	}
3599	rightBox.cat(*box);
3600	*box = rightBox;
3601	#endif
3602	yyTok = getToken();
3603	#ifndef QT_NO_REGEXP_INTERVAL
3604	yyMayCapture = mayCapture;
3605	#endif
3606	}
3607	#undef YYREDO
3608	#ifndef QT_NO_REGEXP_CAPTURE
3609	if (greedyQuantifiers)
3610	finishAtom(outerAtom, hasQuantifier);
3611	#endif
3612	}
3613
3614	void QRegExpEngine::parseTerm(Box *box)
3615	{
3616	#ifndef QT_NO_REGEXP_OPTIM
3617	if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)
3618	parseFactor(box);
3619	#endif
3620	while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {
3621	Box rightBox(this);
3622	parseFactor(&rightBox);
3623	box->cat(rightBox);
3624	}
3625	}
3626
3627	void QRegExpEngine::parseExpression(Box *box)
3628	{
3629	parseTerm(box);
3630	while (yyTok == Tok_Bar) {
3631	#ifndef QT_NO_REGEXP_OPTIM
3632	trivial = false;
3633	#endif
3634	Box rightBox(this);
3635	yyTok = getToken();
3636	parseTerm(&rightBox);
3637	box->orx(rightBox);
3638	}
3639	}
3640
3641	/*
3642	The struct QRegExpPrivate contains the private data of a regular
3643	expression other than the automaton. It makes it possible for many
3644	QRegExp objects to use the same QRegExpEngine object with different
3645	QRegExpPrivate objects.
3646	*/
3647	struct QRegExpPrivate
3648	{
3649	QRegExpEngine *eng;
3650	QRegExpEngineKey engineKey;
3651	bool minimal;
3652	#ifndef QT_NO_REGEXP_CAPTURE
3653	QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()
3654	QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3655	#endif
3656	QRegExpMatchState matchState;
3657
3658	inline QRegExpPrivate()
3659	: eng(0), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }
3660	inline QRegExpPrivate(const QRegExpEngineKey &key)
3661	: eng(0), engineKey(key), minimal(false) {}
3662	};
3663
3664	#if !defined(QT_NO_REGEXP_OPTIM)
3665	uint qHash(const QRegExpEngineKey &key)
3666	{
3667	return qHash(key.pattern);
3668	}
3669
3670	typedef QCache<QRegExpEngineKey, QRegExpEngine> EngineCache;
3671	Q_GLOBAL_STATIC(EngineCache, globalEngineCache)
3672	Q_GLOBAL_STATIC(QMutex, mutex)
3673	#endif // QT_NO_REGEXP_OPTIM
3674
3675	static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key)
3676	{
3677	if (!eng->ref.deref()) {
3678	#if !defined(QT_NO_REGEXP_OPTIM)
3679	if (globalEngineCache()) {
3680	QMutexLocker locker(mutex());
3681	QT_TRY {
3682	globalEngineCache()->insert(key, eng, 4 + key.pattern.length() / 4);
3683	} QT_CATCH(const std::bad_alloc &) {
3684	// in case of an exception (e.g. oom), just delete the engine
3685	delete eng;
3686	}
3687	} else {
3688	delete eng;
3689	}
3690	#else
3691	Q_UNUSED(key);
3692	delete eng;
3693	#endif
3694	}
3695	}
3696
3697	static void prepareEngine_helper(QRegExpPrivate *priv)
3698	{
3699	bool initMatchState = !priv->eng;
3700	#if !defined(QT_NO_REGEXP_OPTIM)
3701	if (!priv->eng && globalEngineCache()) {
3702	QMutexLocker locker(mutex());
3703	priv->eng = globalEngineCache()->take(priv->engineKey);
3704	if (priv->eng != 0)
3705	priv->eng->ref.ref();
3706	}
3707	#endif // QT_NO_REGEXP_OPTIM
3708
3709	if (!priv->eng)
3710	priv->eng = new QRegExpEngine(priv->engineKey);
3711
3712	if (initMatchState)
3713	priv->matchState.prepareForMatch(priv->eng);
3714	}
3715
3716	inline static void prepareEngine(QRegExpPrivate *priv)
3717	{
3718	if (priv->eng)
3719	return;
3720	prepareEngine_helper(priv);
3721	}
3722
3723	static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str)
3724	{
3725	prepareEngine(priv);
3726	priv->matchState.prepareForMatch(priv->eng);
3727	#ifndef QT_NO_REGEXP_CAPTURE
3728	priv->t = str;
3729	priv->capturedCache.clear();
3730	#else
3731	Q_UNUSED(str);
3732	#endif
3733	}
3734
3735	static void invalidateEngine(QRegExpPrivate *priv)
3736	{
3737	if (priv->eng != 0) {
3738	derefEngine(priv->eng, priv->engineKey);
3739	priv->eng = 0;
3740	priv->matchState.drain();
3741	}
3742	}
3743
3744	/*!
3745	\enum QRegExp::CaretMode
3746
3747	The CaretMode enum defines the different meanings of the caret
3748	(\bold{^}) in a regular expression. The possible values are:
3749
3750	\value CaretAtZero
3751	The caret corresponds to index 0 in the searched string.
3752
3753	\value CaretAtOffset
3754	The caret corresponds to the start offset of the search.
3755
3756	\value CaretWontMatch
3757	The caret never matches.
3758	*/
3759
3760	/*!
3761	\enum QRegExp::PatternSyntax
3762
3763	The syntax used to interpret the meaning of the pattern.
3764
3765	\value RegExp A rich Perl-like pattern matching syntax. This is
3766	the default.
3767
3768	\value RegExp2 Like RegExp, but with \l{greedy quantifiers}. This
3769	will be the default in Qt 5. (Introduced in Qt 4.2.)
3770
3771	\value Wildcard This provides a simple pattern matching syntax
3772	similar to that used by shells (command interpreters) for "file
3773	globbing". See \l{Wildcard Matching}.
3774
3775	\value WildcardUnix This is similar to Wildcard but with the
3776	behavior of a Unix shell. The wildcard characters can be escaped
3777	with the character "\".
3778
3779	\value FixedString The pattern is a fixed string. This is
3780	equivalent to using the RegExp pattern on a string in
3781	which all metacharacters are escaped using escape().
3782
3783	\value W3CXmlSchema11 The pattern is a regular expression as
3784	defined by the W3C XML Schema 1.1 specification.
3785
3786	\sa setPatternSyntax()
3787	*/
3788
3789	/*!
3790	Constructs an empty regexp.
3791
3792	\sa isValid(), errorString()
3793	*/
3794	QRegExp::QRegExp()
3795	{
3796	priv = new QRegExpPrivate;
3797	}
3798
3799	/*!
3800	Constructs a regular expression object for the given \a pattern
3801	string. The pattern must be given using wildcard notation if \a
3802	syntax is \l Wildcard; the default is \l RegExp. The pattern is
3803	case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
3804	greedy (maximal), but can be changed by calling
3805	setMinimal().
3806
3807	\sa setPattern(), setCaseSensitivity(), setPatternSyntax()
3808	*/
3809	QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
3810	{
3811	priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs));
3812	}
3813
3814	/*!
3815	Constructs a regular expression as a copy of \a rx.
3816
3817	\sa operator=()
3818	*/
3819	QRegExp::QRegExp(const QRegExp &rx)
3820	{
3821	priv = new QRegExpPrivate;
3822	operator=(rx);
3823	}
3824
3825	/*!
3826	Destroys the regular expression and cleans up its internal data.
3827	*/
3828	QRegExp::~QRegExp()
3829	{
3830	invalidateEngine(priv);
3831	delete priv;
3832	}
3833
3834	/*!
3835	Copies the regular expression \a rx and returns a reference to the
3836	copy. The case sensitivity, wildcard, and minimal matching options
3837	are also copied.
3838	*/
3839	QRegExp &QRegExp::operator=(const QRegExp &rx)
3840	{
3841	prepareEngine(rx.priv); // to allow sharing
3842	QRegExpEngine *otherEng = rx.priv->eng;
3843	if (otherEng)
3844	otherEng->ref.ref();
3845	invalidateEngine(priv);
3846	priv->eng = otherEng;
3847	priv->engineKey = rx.priv->engineKey;
3848	priv->minimal = rx.priv->minimal;
3849	#ifndef QT_NO_REGEXP_CAPTURE
3850	priv->t = rx.priv->t;
3851	priv->capturedCache = rx.priv->capturedCache;
3852	#endif
3853	if (priv->eng)
3854	priv->matchState.prepareForMatch(priv->eng);
3855	priv->matchState.captured = rx.priv->matchState.captured;
3856	return *this;
3857	}
3858
3859	/*!
3860	Returns true if this regular expression is equal to \a rx;
3861	otherwise returns false.
3862
3863	Two QRegExp objects are equal if they have the same pattern
3864	strings and the same settings for case sensitivity, wildcard and
3865	minimal matching.
3866	*/
3867	bool QRegExp::operator==(const QRegExp &rx) const
3868	{
3869	return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;
3870	}
3871
3872	/*!
3873	\fn bool QRegExp::operator!=(const QRegExp &rx) const
3874
3875	Returns true if this regular expression is not equal to \a rx;
3876	otherwise returns false.
3877
3878	\sa operator==()
3879	*/
3880
3881	/*!
3882	Returns true if the pattern string is empty; otherwise returns
3883	false.
3884
3885	If you call exactMatch() with an empty pattern on an empty string
3886	it will return true; otherwise it returns false since it operates
3887	over the whole string. If you call indexIn() with an empty pattern
3888	on \e any string it will return the start offset (0 by default)
3889	because the empty pattern matches the 'emptiness' at the start of
3890	the string. In this case the length of the match returned by
3891	matchedLength() will be 0.
3892
3893	See QString::isEmpty().
3894	*/
3895
3896	bool QRegExp::isEmpty() const
3897	{
3898	return priv->engineKey.pattern.isEmpty();
3899	}
3900
3901	/*!
3902	Returns true if the regular expression is valid; otherwise returns
3903	false. An invalid regular expression never matches.
3904
3905	The pattern \bold{[a-z} is an example of an invalid pattern, since
3906	it lacks a closing square bracket.
3907
3908	Note that the validity of a regexp may also depend on the setting
3909	of the wildcard flag, for example \bold{*.html} is a valid
3910	wildcard regexp but an invalid full regexp.
3911
3912	\sa errorString()
3913	*/
3914	bool QRegExp::isValid() const
3915	{
3916	if (priv->engineKey.pattern.isEmpty()) {
3917	return true;
3918	} else {
3919	prepareEngine(priv);
3920	return priv->eng->isValid();
3921	}
3922	}
3923
3924	/*!
3925	Returns the pattern string of the regular expression. The pattern
3926	has either regular expression syntax or wildcard syntax, depending
3927	on patternSyntax().
3928
3929	\sa patternSyntax(), caseSensitivity()
3930	*/
3931	QString QRegExp::pattern() const
3932	{
3933	return priv->engineKey.pattern;
3934	}
3935
3936	/*!
3937	Sets the pattern string to \a pattern. The case sensitivity,
3938	wildcard, and minimal matching options are not changed.
3939
3940	\sa setPatternSyntax(), setCaseSensitivity()
3941	*/
3942	void QRegExp::setPattern(const QString &pattern)
3943	{
3944	if (priv->engineKey.pattern != pattern) {
3945	invalidateEngine(priv);
3946	priv->engineKey.pattern = pattern;
3947	}
3948	}
3949
3950	/*!
3951	Returns Qt::CaseSensitive if the regexp is matched case
3952	sensitively; otherwise returns Qt::CaseInsensitive.
3953
3954	\sa patternSyntax(), pattern(), isMinimal()
3955	*/
3956	Qt::CaseSensitivity QRegExp::caseSensitivity() const
3957	{
3958	return priv->engineKey.cs;
3959	}
3960
3961	/*!
3962	Sets case sensitive matching to \a cs.
3963
3964	If \a cs is Qt::CaseSensitive, \bold{\\.txt$} matches
3965	\c{readme.txt} but not \c{README.TXT}.
3966
3967	\sa setPatternSyntax(), setPattern(), setMinimal()
3968	*/
3969	void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)
3970	{
3971	if ((bool)cs != (bool)priv->engineKey.cs) {
3972	invalidateEngine(priv);
3973	priv->engineKey.cs = cs;
3974	}
3975	}
3976
3977	/*!
3978	Returns the syntax used by the regular expression. The default is
3979	QRegExp::RegExp.
3980
3981	\sa pattern(), caseSensitivity()
3982	*/
3983	QRegExp::PatternSyntax QRegExp::patternSyntax() const
3984	{
3985	return priv->engineKey.patternSyntax;
3986	}
3987
3988	/*!
3989	Sets the syntax mode for the regular expression. The default is
3990	QRegExp::RegExp.
3991
3992	Setting \a syntax to QRegExp::Wildcard enables simple shell-like
3993	\l{wildcard matching}. For example, \bold{r*.txt} matches the
3994	string \c{readme.txt} in wildcard mode, but does not match
3995	\c{readme}.
3996
3997	Setting \a syntax to QRegExp::FixedString means that the pattern
3998	is interpreted as a plain string. Special characters (e.g.,
3999	backslash) don't need to be escaped then.
4000
4001	\sa setPattern(), setCaseSensitivity(), escape()
4002	*/
4003	void QRegExp::setPatternSyntax(PatternSyntax syntax)
4004	{
4005	if (syntax != priv->engineKey.patternSyntax) {
4006	invalidateEngine(priv);
4007	priv->engineKey.patternSyntax = syntax;
4008	}
4009	}
4010
4011	/*!
4012	Returns true if minimal (non-greedy) matching is enabled;
4013	otherwise returns false.
4014
4015	\sa caseSensitivity(), setMinimal()
4016	*/
4017	bool QRegExp::isMinimal() const
4018	{
4019	return priv->minimal;
4020	}
4021
4022	/*!
4023	Enables or disables minimal matching. If \a minimal is false,
4024	matching is greedy (maximal) which is the default.
4025
4026	For example, suppose we have the input string "We must be
4027	<b>bold</b>, very <b>bold</b>!" and the pattern
4028	\bold{<b>.*</b>}. With the default greedy (maximal) matching,
4029	the match is "We must be \underline{<b>bold</b>, very
4030	<b>bold</b>}!". But with minimal (non-greedy) matching, the
4031	first match is: "We must be \underline{<b>bold</b>}, very
4032	<b>bold</b>!" and the second match is "We must be <b>bold</b>,
4033	very \underline{<b>bold</b>}!". In practice we might use the pattern
4034	\bold{<b>[^<]*\</b>} instead, although this will still fail for
4035	nested tags.
4036
4037	\sa setCaseSensitivity()
4038	*/
4039	void QRegExp::setMinimal(bool minimal)
4040	{
4041	priv->minimal = minimal;
4042	}
4043
4044	// ### Qt 5: make non-const
4045	/*!
4046	Returns true if \a str is matched exactly by this regular
4047	expression; otherwise returns false. You can determine how much of
4048	the string was matched by calling matchedLength().
4049
4050	For a given regexp string R, exactMatch("R") is the equivalent of
4051	indexIn("^R$") since exactMatch() effectively encloses the regexp
4052	in the start of string and end of string anchors, except that it
4053	sets matchedLength() differently.
4054
4055	For example, if the regular expression is \bold{blue}, then
4056	exactMatch() returns true only for input \c blue. For inputs \c
4057	bluebell, \c blutak and \c lightblue, exactMatch() returns false
4058	and matchedLength() will return 4, 3 and 0 respectively.
4059
4060	Although const, this function sets matchedLength(),
4061	capturedTexts(), and pos().
4062
4063	\sa indexIn(), lastIndexIn()
4064	*/
4065	bool QRegExp::exactMatch(const QString &str) const
4066	{
4067	prepareEngineForMatch(priv, str);
4068	priv->matchState.match(str.unicode(), str.length(), 0, priv->minimal, true, 0);
4069	if (priv->matchState.captured[1] == str.length()) {
4070	return true;
4071	} else {
4072	priv->matchState.captured[0] = 0;
4073	priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen;
4074	return false;
4075	}
4076	}
4077
4078	// ### Qt 5: make non-const
4079	/*!
4080	Attempts to find a match in \a str from position \a offset (0 by
4081	default). If \a offset is -1, the search starts at the last
4082	character; if -2, at the next to last character; etc.
4083
4084	Returns the position of the first match, or -1 if there was no
4085	match.
4086
4087	The \a caretMode parameter can be used to instruct whether \bold{^}
4088	should match at index 0 or at \a offset.
4089
4090	You might prefer to use QString::indexOf(), QString::contains(),
4091	or even QStringList::filter(). To replace matches use
4092	QString::replace().
4093
4094	Example:
4095	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 13
4096
4097	Although const, this function sets matchedLength(),
4098	capturedTexts() and pos().
4099
4100	If the QRegExp is a wildcard expression (see setPatternSyntax())
4101	and you want to test a string against the whole wildcard expression,
4102	use exactMatch() instead of this function.
4103
4104	\sa lastIndexIn(), exactMatch()
4105	*/
4106
4107	int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const
4108	{
4109	prepareEngineForMatch(priv, str);
4110	if (offset < 0)
4111	offset += str.length();
4112	priv->matchState.match(str.unicode(), str.length(), offset,
4113	priv->minimal, false, caretIndex(offset, caretMode));
4114	return priv->matchState.captured[0];
4115	}
4116
4117	// ### Qt 5: make non-const
4118	/*!
4119	Attempts to find a match backwards in \a str from position \a
4120	offset. If \a offset is -1 (the default), the search starts at the
4121	last character; if -2, at the next to last character; etc.
4122
4123	Returns the position of the first match, or -1 if there was no
4124	match.
4125
4126	The \a caretMode parameter can be used to instruct whether \bold{^}
4127	should match at index 0 or at \a offset.
4128
4129	Although const, this function sets matchedLength(),
4130	capturedTexts() and pos().
4131
4132	\warning Searching backwards is much slower than searching
4133	forwards.
4134
4135	\sa indexIn(), exactMatch()
4136	*/
4137
4138	int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const
4139	{
4140	prepareEngineForMatch(priv, str);
4141	if (offset < 0)
4142	offset += str.length();
4143	if (offset < 0 \|\| offset > str.length()) {
4144	memset(priv->matchState.captured, -1, priv->matchState.capturedSize*sizeof(int));
4145	return -1;
4146	}
4147
4148	while (offset >= 0) {
4149	priv->matchState.match(str.unicode(), str.length(), offset,
4150	priv->minimal, true, caretIndex(offset, caretMode));
4151	if (priv->matchState.captured[0] == offset)
4152	return offset;
4153	--offset;
4154	}
4155	return -1;
4156	}
4157
4158	/*!
4159	Returns the length of the last matched string, or -1 if there was
4160	no match.
4161
4162	\sa exactMatch(), indexIn(), lastIndexIn()
4163	*/
4164	int QRegExp::matchedLength() const
4165	{
4166	return priv->matchState.captured[1];
4167	}
4168
4169	#ifndef QT_NO_REGEXP_CAPTURE
4170	/*!
4171	\obsolete
4172	Returns the number of captures contained in the regular expression.
4173
4174	\sa captureCount()
4175	*/
4176	int QRegExp::numCaptures() const
4177	{
4178	return captureCount();
4179	}
4180
4181	/*!
4182	\since 4.6
4183	Returns the number of captures contained in the regular expression.
4184	*/
4185	int QRegExp::captureCount() const
4186	{
4187	prepareEngine(priv);
4188	return priv->eng->captureCount();
4189	}
4190
4191	/*!
4192	Returns a list of the captured text strings.
4193
4194	The first string in the list is the entire matched string. Each
4195	subsequent list element contains a string that matched a
4196	(capturing) subexpression of the regexp.
4197
4198	For example:
4199	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 14
4200
4201	The above example also captures elements that may be present but
4202	which we have no interest in. This problem can be solved by using
4203	non-capturing parentheses:
4204
4205	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 15
4206
4207	Note that if you want to iterate over the list, you should iterate
4208	over a copy, e.g.
4209	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 16
4210
4211	Some regexps can match an indeterminate number of times. For
4212	example if the input string is "Offsets: 12 14 99 231 7" and the
4213	regexp, \c{rx}, is \bold{(\\d+)+}, we would hope to get a list of
4214	all the numbers matched. However, after calling
4215	\c{rx.indexIn(str)}, capturedTexts() will return the list ("12",
4216	"12"), i.e. the entire match was "12" and the first subexpression
4217	matched was "12". The correct approach is to use cap() in a
4218	\l{QRegExp#cap_in_a_loop}{loop}.
4219
4220	The order of elements in the string list is as follows. The first
4221	element is the entire matching string. Each subsequent element
4222	corresponds to the next capturing open left parentheses. Thus
4223	capturedTexts()[1] is the text of the first capturing parentheses,
4224	capturedTexts()[2] is the text of the second and so on
4225	(corresponding to $1, $2, etc., in some other regexp languages).
4226
4227	\sa cap(), pos()
4228	*/
4229	QStringList QRegExp::capturedTexts() const
4230	{
4231	if (priv->capturedCache.isEmpty()) {
4232	prepareEngine(priv);
4233	const int *captured = priv->matchState.captured;
4234	int n = priv->matchState.capturedSize;
4235
4236	for (int i = 0; i < n; i += 2) {
4237	QString m;
4238	if (captured[i + 1] == 0)
4239	m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty
4240	else if (captured[i] >= 0)
4241	m = priv->t.mid(captured[i], captured[i + 1]);
4242	priv->capturedCache.append(m);
4243	}
4244	priv->t.clear();
4245	}
4246	return priv->capturedCache;
4247	}
4248
4249	/*!
4250	\internal
4251	*/
4252	QStringList QRegExp::capturedTexts()
4253	{
4254	return const_cast<const QRegExp *>(this)->capturedTexts();
4255	}
4256
4257	/*!
4258	Returns the text captured by the \a nth subexpression. The entire
4259	match has index 0 and the parenthesized subexpressions have
4260	indexes starting from 1 (excluding non-capturing parentheses).
4261
4262	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 17
4263
4264	The order of elements matched by cap() is as follows. The first
4265	element, cap(0), is the entire matching string. Each subsequent
4266	element corresponds to the next capturing open left parentheses.
4267	Thus cap(1) is the text of the first capturing parentheses, cap(2)
4268	is the text of the second, and so on.
4269
4270	\sa capturedTexts(), pos()
4271	*/
4272	QString QRegExp::cap(int nth) const
4273	{
4274	return capturedTexts().value(nth);
4275	}
4276
4277	/*!
4278	\internal
4279	*/
4280	QString QRegExp::cap(int nth)
4281	{
4282	return const_cast<const QRegExp *>(this)->cap(nth);
4283	}
4284
4285	/*!
4286	Returns the position of the \a nth captured text in the searched
4287	string. If \a nth is 0 (the default), pos() returns the position
4288	of the whole match.
4289
4290	Example:
4291	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 18
4292
4293	For zero-length matches, pos() always returns -1. (For example, if
4294	cap(4) would return an empty string, pos(4) returns -1.) This is
4295	a feature of the implementation.
4296
4297	\sa cap(), capturedTexts()
4298	*/
4299	int QRegExp::pos(int nth) const
4300	{
4301	if (nth < 0 \|\| nth >= priv->matchState.capturedSize / 2)
4302	return -1;
4303	else
4304	return priv->matchState.captured[2 * nth];
4305	}
4306
4307	/*!
4308	\internal
4309	*/
4310	int QRegExp::pos(int nth)
4311	{
4312	return const_cast<const QRegExp *>(this)->pos(nth);
4313	}
4314
4315	/*!
4316	Returns a text string that explains why a regexp pattern is
4317	invalid, if that is the case; otherwise returns "no error occurred".
4318
4319	\sa isValid()
4320	*/
4321	QString QRegExp::errorString() const
4322	{
4323	if (isValid()) {
4324	return QString::fromLatin1(RXERR_OK);
4325	} else {
4326	return priv->eng->errorString();
4327	}
4328	}
4329
4330	/*!
4331	\internal
4332	*/
4333	QString QRegExp::errorString()
4334	{
4335	return const_cast<const QRegExp *>(this)->errorString();
4336	}
4337	#endif
4338
4339	/*!
4340	Returns the string \a str with every regexp special character
4341	escaped with a backslash. The special characters are $, (, ), *, +,
4342	., ?, [, \\, ], ^, {, | and }.
4343
4344	Example:
4345
4346	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 19
4347
4348	This function is useful to construct regexp patterns dynamically:
4349
4350	\snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 20
4351
4352	\sa setPatternSyntax()
4353	*/
4354	QString QRegExp::escape(const QString &str)
4355	{
4356	QString quoted;
4357	const int count = str.count();
4358	quoted.reserve(count * 2);
4359	const QLatin1Char backslash('\\');
4360	for (int i = 0; i < count; i++) {
4361	switch (str.at(i).toLatin1()) {
4362	case '$':
4363	case '(':
4364	case ')':
4365	case '*':
4366	case '+':
4367	case '.':
4368	case '?':
4369	case '[':
4370	case '\\':
4371	case ']':
4372	case '^':
4373	case '{':
4374	case '\|':
4375	case '}':
4376	quoted.append(backslash);
4377	}
4378	quoted.append(str.at(i));
4379	}
4380	return quoted;
4381	}
4382
4383	/*!
4384	\fn bool QRegExp::caseSensitive() const
4385
4386	Use \l caseSensitivity() instead.
4387	*/
4388
4389	/*!
4390	\fn void QRegExp::setCaseSensitive(bool sensitive)
4391
4392	Use \l setCaseSensitivity() instead.
4393	*/
4394
4395	/*!
4396	\fn bool QRegExp::wildcard() const
4397
4398	Use \l patternSyntax() instead.
4399
4400	\oldcode
4401	bool wc = rx.wildcard();
4402	\newcode
4403	bool wc = (rx.patternSyntax() == QRegExp::Wildcard);
4404	\endcode
4405	*/
4406
4407	/*!
4408	\fn void QRegExp::setWildcard(bool wildcard)
4409
4410	Use \l setPatternSyntax() instead.
4411
4412	\oldcode
4413	rx.setWildcard(wc);
4414	\newcode
4415	rx.setPatternSyntax(wc ? QRegExp::Wildcard : QRegExp::RegExp);
4416	\endcode
4417	*/
4418
4419	/*!
4420	\fn bool QRegExp::minimal() const
4421
4422	Use \l isMinimal() instead.
4423	*/
4424
4425	/*!
4426	\fn int QRegExp::search(const QString &str, int from = 0,
4427	CaretMode caretMode = CaretAtZero) const
4428
4429	Use \l indexIn() instead.
4430	*/
4431
4432	/*!
4433	\fn int QRegExp::searchRev(const QString &str, int from = -1, \
4434	CaretMode caretMode = CaretAtZero) const
4435
4436	Use \l lastIndexIn() instead.
4437	*/
4438
4439	/*!
4440	\fn QRegExp::QRegExp(const QString &pattern, bool cs, bool wildcard = false)
4441
4442	Use another constructor instead.
4443
4444	\oldcode
4445	QRegExp rx("*.txt", false, true);
4446	\newcode
4447	QRegExp rx("*.txt", Qt::CaseInsensitive, QRegExp::Wildcard);
4448	\endcode
4449	*/
4450
4451	#ifndef QT_NO_DATASTREAM
4452	/*!
4453	\relates QRegExp
4454
4455	Writes the regular expression \a regExp to stream \a out.
4456
4457	\sa {Format of the QDataStream Operators}
4458	*/
4459	QDataStream &operator<<(QDataStream &out, const QRegExp &regExp)
4460	{
4461	return out << regExp.pattern() << (quint8)regExp.caseSensitivity()
4462	<< (quint8)regExp.patternSyntax()
4463	<< (quint8)!!regExp.isMinimal();
4464	}
4465
4466	/*!
4467	\relates QRegExp
4468
4469	Reads a regular expression from stream \a in into \a regExp.
4470
4471	\sa {Format of the QDataStream Operators}
4472	*/
4473	QDataStream &operator>>(QDataStream &in, QRegExp &regExp)
4474	{
4475	QString pattern;
4476	quint8 cs;
4477	quint8 patternSyntax;
4478	quint8 isMinimal;
4479
4480	in >> pattern >> cs >> patternSyntax >> isMinimal;
4481
4482	QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),
4483	QRegExp::PatternSyntax(patternSyntax));
4484
4485	newRegExp.setMinimal(isMinimal);
4486	regExp = newRegExp;
4487	return in;
4488	}
4489	#endif // QT_NO_DATASTREAM
4490
4491	QT_END_NAMESPACE

Note: See TracBrowser for help on using the repository browser.

Download in other formats: