Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

ParseWords.pm@ 3299

Visit:

Last change on this file since 3299 was 3181, checked in by bird, 19 years ago
perl 5.8.8
File size: 6.5 KB

Line
1	package Text::ParseWords;
2
3	use vars qw($VERSION @ISA @EXPORT $PERL_SINGLE_QUOTE);
4	$VERSION = "3.24";
5
6	require 5.000;
7
8	use Exporter;
9	@ISA = qw(Exporter);
10	@EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
11	@EXPORT_OK = qw(old_shellwords);
12
13
14	sub shellwords {
15	my(@lines) = @_;
16	$lines[$#lines] =~ s/\s+$//;
17	return(quotewords('\s+', 0, @lines));
18	}
19
20
21
22	sub quotewords {
23	my($delim, $keep, @lines) = @_;
24	my($line, @words, @allwords);
25
26	foreach $line (@lines) {
27	@words = parse_line($delim, $keep, $line);
28	return() unless (@words \|\| !length($line));
29	push(@allwords, @words);
30	}
31	return(@allwords);
32	}
33
34
35
36	sub nested_quotewords {
37	my($delim, $keep, @lines) = @_;
38	my($i, @allwords);
39
40	for ($i = 0; $i < @lines; $i++) {
41	@{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]);
42	return() unless (@{$allwords[$i]} \|\| !length($lines[$i]));
43	}
44	return(@allwords);
45	}
46
47
48
49	sub parse_line {
50	my($delimiter, $keep, $line) = @_;
51	my($word, @pieces);
52
53	no warnings 'uninitialized'; # we will be testing undef strings
54
55	while (length($line)) {
56	$line =~ s/^(["']) # a $quote
57	((?:\\.\|(?!\1)[^\\])*) # and $quoted text
58	\1 # followed by the same quote
59	\| # --OR--
60	^((?:\\.\|[^\\"'])*?) # an $unquoted text
61	(\Z(?!\n)\|(?-x:$delimiter)\|(?!^)(?=["']))
62	# plus EOL, delimiter, or quote
63	//xs or return; # extended layout
64	my($quote, $quoted, $unquoted, $delim) = ($1, $2, $3, $4);
65	return() unless( defined($quote) \|\| length($unquoted) \|\| length($delim));
66
67	if ($keep) {
68	$quoted = "$quote$quoted$quote";
69	}
70	else {
71	$unquoted =~ s/\\(.)/$1/sg;
72	if (defined $quote) {
73	$quoted =~ s/\\(.)/$1/sg if ($quote eq '"');
74	$quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'");
75	}
76	}
77	$word .= substr($line, 0, 0); # leave results tainted
78	$word .= defined $quote ? $quoted : $unquoted;
79
80	if (length($delim)) {
81	push(@pieces, $word);
82	push(@pieces, $delim) if ($keep eq 'delimiters');
83	undef $word;
84	}
85	if (!length($line)) {
86	push(@pieces, $word);
87	}
88	}
89	return(@pieces);
90	}
91
92
93
94	sub old_shellwords {
95
96	# Usage:
97	# use ParseWords;
98	# @words = old_shellwords($line);
99	# or
100	# @words = old_shellwords(@lines);
101	# or
102	# @words = old_shellwords(); # defaults to $_ (and clobbers it)
103
104	no warnings 'uninitialized'; # we will be testing undef strings
105	local *_ = \join('', @_) if @_;
106	my (@words, $snippet);
107
108	s/\A\s+//;
109	while ($_ ne '') {
110	my $field = substr($_, 0, 0); # leave results tainted
111	for (;;) {
112	if (s/\A"(([^"\\]\|\\.)*)"//s) {
113	($snippet = $1) =~ s#\\(.)#$1#sg;
114	}
115	elsif (/\A"/) {
116	require Carp;
117	Carp::carp("Unmatched double quote: $_");
118	return();
119	}
120	elsif (s/\A'(([^'\\]\|\\.)*)'//s) {
121	($snippet = $1) =~ s#\\(.)#$1#sg;
122	}
123	elsif (/\A'/) {
124	require Carp;
125	Carp::carp("Unmatched single quote: $_");
126	return();
127	}
128	elsif (s/\A\\(.)//s) {
129	$snippet = $1;
130	}
131	elsif (s/\A([^\s\\'"]+)//) {
132	$snippet = $1;
133	}
134	else {
135	s/\A\s+//;
136	last;
137	}
138	$field .= $snippet;
139	}
140	push(@words, $field);
141	}
142	return @words;
143	}
144
145	1;
146
147	__END__
148
149	=head1 NAME
150
151	Text::ParseWords - parse text into an array of tokens or array of arrays
152
153	=head1 SYNOPSIS
154
155	use Text::ParseWords;
156	@lists = &nested_quotewords($delim, $keep, @lines);
157	@words = &quotewords($delim, $keep, @lines);
158	@words = &shellwords(@lines);
159	@words = &parse_line($delim, $keep, $line);
160	@words = &old_shellwords(@lines); # DEPRECATED!
161
162	=head1 DESCRIPTION
163
164	The &nested_quotewords() and &quotewords() functions accept a delimiter
165	(which can be a regular expression)
166	and a list of lines and then breaks those lines up into a list of
167	words ignoring delimiters that appear inside quotes. &quotewords()
168	returns all of the tokens in a single long list, while &nested_quotewords()
169	returns a list of token lists corresponding to the elements of @lines.
170	&parse_line() does tokenizing on a single string. The &*quotewords()
171	functions simply call &parse_line(), so if you're only splitting
172	one line you can call &parse_line() directly and save a function
173	call.
174
175	The $keep argument is a boolean flag. If true, then the tokens are
176	split on the specified delimiter, but all other characters (quotes,
177	backslashes, etc.) are kept in the tokens. If $keep is false then the
178	&*quotewords() functions remove all quotes and backslashes that are
179	not themselves backslash-escaped or inside of single quotes (i.e.,
180	&quotewords() tries to interpret these characters just like the Bourne
181	shell). NB: these semantics are significantly different from the
182	original version of this module shipped with Perl 5.000 through 5.004.
183	As an additional feature, $keep may be the keyword "delimiters" which
184	causes the functions to preserve the delimiters in each string as
185	tokens in the token lists, in addition to preserving quote and
186	backslash characters.
187
188	&shellwords() is written as a special case of &quotewords(), and it
189	does token parsing with whitespace as a delimiter-- similar to most
190	Unix shells.
191
192	=head1 EXAMPLES
193
194	The sample program:
195
196	use Text::ParseWords;
197	@words = &quotewords('\s+', 0, q{this is "a test" of\ quotewords \"for you});
198	$i = 0;
199	foreach (@words) {
200	print "$i: <$_>\n";
201	$i++;
202	}
203
204	produces:
205
206	0: <this>
207	1: <is>
208	2: <a test>
209	3: <of quotewords>
210	4: <"for>
211	5: <you>
212
213	demonstrating:
214
215	=over 4
216
217	=item 0
218
219	a simple word
220
221	=item 1
222
223	multiple spaces are skipped because of our $delim
224
225	=item 2
226
227	use of quotes to include a space in a word
228
229	=item 3
230
231	use of a backslash to include a space in a word
232
233	=item 4
234
235	use of a backslash to remove the special meaning of a double-quote
236
237	=item 5
238
239	another simple word (note the lack of effect of the
240	backslashed double-quote)
241
242	=back
243
244	Replacing C<&quotewords('\s+', 0, q{this is...})>
245	with C<&shellwords(q{this is...})>
246	is a simpler way to accomplish the same thing.
247
248	=head1 AUTHORS
249
250	Maintainer is Hal Pomeranz <[email protected]>, 1994-1997 (Original
251	author unknown). Much of the code for &parse_line() (including the
252	primary regexp) from Joerk Behrends <[email protected]>.
253
254	Examples section another documentation provided by John Heidemann
255	<[email protected]>
256
257	Bug reports, patches, and nagging provided by lots of folks-- thanks
258	everybody! Special thanks to Michael Schwern <[email protected]>
259	for assuring me that a &nested_quotewords() would be useful, and to
260	Jeff Friedl <[email protected]> for telling me not to worry about
261	error-checking (sort of-- you had to be there).
262
263	=cut

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/dev-lang/perl/lib/Text/ParseWords.pm@ 3299

Download in other formats: