source: trunk/essentials/dev-lang/perl/regcomp.h@ 3280

Last change on this file since 3280 was 3181, checked in by bird, 19 years ago

perl 5.8.8

File size: 13.0 KB
Line 
1/* regcomp.h
2 *
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
4 * 2000, 2001, 2002, 2003, 2005 by Larry Wall and others
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11typedef OP OP_4tree; /* Will be redefined later. */
12
13/*
14 * The "internal use only" fields in regexp.h are present to pass info from
15 * compile to execute that permits the execute phase to run lots faster on
16 * simple cases. They are:
17 *
18 * regstart sv that must begin a match; Nullch if none obvious
19 * reganch is the match anchored (at beginning-of-line only)?
20 * regmust string (pointer into program) that match must include, or NULL
21 * [regmust changed to SV* for bminstr()--law]
22 * regmlen length of regmust string
23 * [regmlen not used currently]
24 *
25 * Regstart and reganch permit very fast decisions on suitable starting points
26 * for a match, cutting down the work a lot. Regmust permits fast rejection
27 * of lines that cannot possibly match. The regmust tests are costly enough
28 * that pregcomp() supplies a regmust only if the r.e. contains something
29 * potentially expensive (at present, the only such thing detected is * or +
30 * at the start of the r.e., which can involve a lot of backup). Regmlen is
31 * supplied because the test in pregexec() needs it and pregcomp() is computing
32 * it anyway.
33 * [regmust is now supplied always. The tests that use regmust have a
34 * heuristic that disables the test if it usually matches.]
35 *
36 * [In fact, we now use regmust in many cases to locate where the search
37 * starts in the string, so if regback is >= 0, the regmust search is never
38 * wasted effort. The regback variable says how many characters back from
39 * where regmust matched is the earliest possible start of the match.
40 * For instance, /[a-z].foo/ has a regmust of 'foo' and a regback of 2.]
41 */
42
43/*
44 * Structure for regexp "program". This is essentially a linear encoding
45 * of a nondeterministic finite-state machine (aka syntax charts or
46 * "railroad normal form" in parsing technology). Each node is an opcode
47 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
48 * all nodes except BRANCH implement concatenation; a "next" pointer with
49 * a BRANCH on both ends of it is connecting two alternatives. (Here we
50 * have one of the subtle syntax dependencies: an individual BRANCH (as
51 * opposed to a collection of them) is never concatenated with anything
52 * because of operator precedence.) The operand of some types of node is
53 * a literal string; for others, it is a node leading into a sub-FSM. In
54 * particular, the operand of a BRANCH node is the first node of the branch.
55 * (NB this is *not* a tree structure: the tail of the branch connects
56 * to the thing following the set of BRANCHes.) The opcodes are:
57 */
58
59/*
60 * A node is one char of opcode followed by two chars of "next" pointer.
61 * "Next" pointers are stored as two 8-bit pieces, high order first. The
62 * value is a positive offset from the opcode of the node containing it.
63 * An operand, if any, simply follows the node. (Note that much of the
64 * code generation knows about this implicit relationship.)
65 *
66 * Using two bytes for the "next" pointer is vast overkill for most things,
67 * but allows patterns to get big without disasters.
68 *
69 * [The "next" pointer is always aligned on an even
70 * boundary, and reads the offset directly as a short. Also, there is no
71 * special test to reverse the sign of BACK pointers since the offset is
72 * stored negative.]
73 */
74
75struct regnode_string {
76 U8 str_len;
77 U8 type;
78 U16 next_off;
79 char string[1];
80};
81
82struct regnode_1 {
83 U8 flags;
84 U8 type;
85 U16 next_off;
86 U32 arg1;
87};
88
89struct regnode_2 {
90 U8 flags;
91 U8 type;
92 U16 next_off;
93 U16 arg1;
94 U16 arg2;
95};
96
97#define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */
98#define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */
99
100struct regnode_charclass {
101 U8 flags;
102 U8 type;
103 U16 next_off;
104 U32 arg1;
105 char bitmap[ANYOF_BITMAP_SIZE]; /* only compile-time */
106};
107
108struct regnode_charclass_class { /* has [[:blah:]] classes */
109 U8 flags; /* should have ANYOF_CLASS here */
110 U8 type;
111 U16 next_off;
112 U32 arg1;
113 char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time */
114 char classflags[ANYOF_CLASSBITMAP_SIZE]; /* and run-time */
115};
116
117/* XXX fix this description.
118 Impose a limit of REG_INFTY on various pattern matching operations
119 to limit stack growth and to avoid "infinite" recursions.
120*/
121/* The default size for REG_INFTY is I16_MAX, which is the same as
122 SHORT_MAX (see perl.h). Unfortunately I16 isn't necessarily 16 bits