Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

charnames.pm@ 3298

Visit:

Last change on this file since 3298 was 3181, checked in by bird, 19 years ago
perl 5.8.8
File size: 15.1 KB

Line
1	package charnames;
2	use strict;
3	use warnings;
4	use Carp;
5	use File::Spec;
6	our $VERSION = '1.05';
7
8	use bytes (); # for $bytes::hint_bits
9	$charnames::hint_bits = 0x20000; # HINT_LOCALIZE_HH
10
# Built-in aliases.  charnames() silently replaces any key found here with
# the name it maps to before doing the lookup.
my %alias1 = (
    # Icky 3.2 names with parentheses.
    'LINE FEED'       => 'LINE FEED (LF)',
    'FORM FEED'       => 'FORM FEED (FF)',
    'CARRIAGE RETURN' => 'CARRIAGE RETURN (CR)',
    'NEXT LINE'       => 'NEXT LINE (NEL)',

    # Convenience.
    'LF'              => 'LINE FEED (LF)',
    'FF'              => 'FORM FEED (FF)',
    'CR'              => 'CARRIAGE RETURN (CR)',
    'NEL'             => 'NEXT LINE (NEL)',

    # More convenience.  For further convenience, it is suggested that
    # some way of using the NamesList aliases be implemented.
    'ZWNJ'            => 'ZERO WIDTH NON-JOINER',
    'ZWJ'             => 'ZERO WIDTH JOINER',
    'BOM'             => 'BYTE ORDER MARK',
);

# Old names that still work; charnames() issues a 'deprecated' warning when
# one of them is used and then continues with the new name.
my %alias2 = (
    # Pre-3.2 compatibility (only for the first 256 characters).
    'HORIZONTAL TABULATION' => 'CHARACTER TABULATION',
    'VERTICAL TABULATION'   => 'LINE TABULATION',
    'FILE SEPARATOR'        => 'INFORMATION SEPARATOR FOUR',
    'GROUP SEPARATOR'       => 'INFORMATION SEPARATOR THREE',
    'RECORD SEPARATOR'      => 'INFORMATION SEPARATOR TWO',
    'UNIT SEPARATOR'        => 'INFORMATION SEPARATOR ONE',
    'PARTIAL LINE DOWN'     => 'PARTIAL LINE FORWARD',
    'PARTIAL LINE UP'       => 'PARTIAL LINE BACKWARD',
);

# User defined aliases.  Even more convenient :)
# Starts out empty; alias() adds the entries.
my %alias3 = ();

# Contents of unicore/Name.pl, loaded on first use and then kept around.
my $txt;
46
# Register user defined aliases, or list the ones registered so far.
#
# Without arguments the current contents of %alias3 are returned.
# Otherwise the argument is either a single reference, used as a hash of
# alias => name pairs, or a flat list of such pairs.  Existing aliases with
# the same key are overwritten.
sub alias (@)
{
    return %alias3 unless @_;

    my $pairs;
    if (ref $_[0]) {
        $pairs = $_[0];
    }
    else {
        $pairs = { @_ };
    }

    # The slice assignment is deliberately the last statement: its value is
    # what the sub returns.
    @alias3{ keys %$pairs } = values %$pairs;
} # alias
53
# Load alias pairs from a file and register them through alias().
#
# The argument is either the absolute path of an existing plain file, or a
# word made of identifier characters, in which case the file
# "unicore/<word>_alias.pl" is used.  The file is run with do() and has to
# return an even-sized list of alias => name pairs.
#
# Returns 1 when aliases were loaded and 0 when the file returned an empty
# list.  Croaks on an unacceptable argument, when do() yields a single
# undefined value, or when the list has an odd number of elements.
sub alias_file ($)
{
    my ($arg) = @_;

    my $file;
    if (-f $arg && File::Spec->file_name_is_absolute ($arg)) {
        $file = $arg;
    }
    elsif ($arg =~ m/^\w+$/) {
        $file = "unicore/${arg}_alias.pl";
    }
    else {
        croak "Charnames alias files can only have identifier characters";
    }

    my @alias = do $file;
    return 0 unless @alias;

    croak "$file cannot be used as alias file for charnames"
        if @alias == 1 && !defined $alias[0];
    croak "$file did not return a (valid) list of alias pairs"
        if @alias % 2;

    alias (@alias);
    return 1;
} # alias_file
76
# This is not optimized in any way yet
#
# Translator that import() installs in $^H{charnames}: takes the text found
# inside \N{...} and returns the character it names.
#
# The name is first mapped through the alias tables (%alias1, %alias2 with a
# deprecation warning, %alias3).  "BYTE ORDER MARK" is answered directly;
# everything else is searched for in unicore/Name.pl, trying in this order
#   1. the exact name, if :full is in effect,
#   2. "script:name", if :short is in effect,
#   3. the name as a letter of each script given to "use charnames".
# An unknown name produces a carp and the replacement character U+FFFD.
#
# When "use bytes" is in effect, only code points up to 255 can be returned;
# anything above makes the sub croak.
sub charnames
{
    my $name = shift;

    # Alias resolution; the first table that knows the name wins.
    if (exists $alias1{$name}) {
        $name = $alias1{$name};
    }
    elsif (exists $alias2{$name}) {
        require warnings;
        warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead});
        $name = $alias2{$name};
    }
    elsif (exists $alias3{$name}) {
        $name = $alias3{$name};
    }

    my ($ord, $fname);
    my @off;    # start and end offset of the match ("\t\tNAME") inside $txt

    if ($name eq "BYTE ORDER MARK") {
        ($fname, $ord) = ($name, 0xFEFF);
    }
    else {
        ## Suck in the code/name list as a big string.
        ## Lines look like:
        ##     "0052\t\tLATIN CAPITAL LETTER R\n"
        $txt ||= do "unicore/Name.pl";

        ## If :full, look for the name exactly
        if ($^H{charnames_full} && $txt =~ /\t\t\Q$name\E$/m) {
            @off = ($-[0], $+[0]);
        }

        ## Nothing yet and :short allowed: try the short form, which looks
        ## like "greek:Sigma"
        if (!@off && $^H{charnames_short} && $name =~ /^(.+?):(.+)/s) {
            my ($script, $cname) = ($1, $2);
            my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
            if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) {
                @off = ($-[0], $+[0]);
            }
        }

        ## Still nothing: check for the name among the loaded scripts.
        if (!@off) {
            my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
            for my $script (@{$^H{charnames_scripts}}) {
                next unless $txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m;
                @off = ($-[0], $+[0]);
                last;
            }
        }

        ## If we don't have it by now, give up.
        if (!@off) {
            carp "Unknown charname '$name'";
            return "\x{FFFD}";
        }

        ## The code, in hex, sits between the start of the line and the
        ## match.  rindex() finds the newline ending the previous line; the
        ## +1 skips past it, and also turns a failed rindex() (-1, match on
        ## the very first line) into offset zero.
        my $line_start = rindex($txt, "\n", $off[0]) + 1;

        ## The hex digits in front of the tabs are the ordinal of the char.
        $ord = hex substr($txt, $line_start, $off[0] - $line_start);
    }

    if ($^H & $bytes::hint_bits) { # "use bytes" in effect?
        use bytes;
        return chr $ord if $ord <= 255;

        my $hex = sprintf "%04x", $ord;
        unless (defined $fname) {
            # Cut the name out of the match, leaving off the two tabs.
            $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
        }
        croak "Character 0x$hex with name '$fname' is above 0xFF";
    }

    no warnings 'utf8'; # allow even illegal characters
    return pack "U", $ord;
} # charnames
178
# Called for "use charnames LIST".
#
# Sets the $charnames::hint_bits flag in $^H and installs charnames() as
# $^H{charnames}, then sorts the arguments into the lookup settings that
# charnames() reads:
#
#   :full          sets $^H{charnames_full}
#   :short         sets $^H{charnames_short}
#   :alias => ARG  ARG is consumed as well.  A HASH reference is handed to
#                  alias(); a string ending in ":name" loads the alias file
#                  for "name" and, if that succeeds and no other argument is
#                  left, implies :full; any other string is handed to
#                  alias_file() as is.
#   anything else  is taken as a script name and stored, upper-cased, in
#                  $^H{charnames_scripts}
#
# Carps when the list is empty, croaks on a missing or unusable :alias
# argument, and warns about any other argument starting with ":".  If 'utf8'
# warnings are enabled, a warning is also given for each script that has no
# letter at all in unicore/Name.pl.
sub import
{
    shift; ## ignore class name

    if (not @_) {
        carp("`use charnames' needs explicit imports list");
    }
    $^H |= $charnames::hint_bits;
    $^H{charnames} = \&charnames ;

    ##
    ## fill %h keys with our @_ args.
    ##
    my ($promote, %h, @args) = (0);
    while (my $arg = shift) {
        if ($arg eq ":alias") {
            @_ or
                croak ":alias needs an argument in charnames";
            my $alias = shift;
            if (ref $alias) {
                ref $alias eq "HASH" or
                    croak "Only HASH reference supported as argument to :alias";
                alias ($alias);
                next;
            }
            if ($alias =~ m{:(\w+)$}) {
                # ":full" / ":short" here most likely means the caller put
                # the pragma where the alias argument belongs.
                $1 eq "full" || $1 eq "short" and
                    croak ":alias cannot use existing pragma :$1 (reversed order?)";
                alias_file ($1) and $promote = 1;
                next;
            }
            alias_file ($alias);
            next;
        }
        if (substr($arg, 0, 1) eq ':' and ! ($arg eq ":full" || $arg eq ":short")) {
            warn "unsupported special '$arg' in charnames";
            next;
        }
        push @args, $arg;
    }

    # The ":name" alias shortcut stands in for an explicit :full.
    @args == 0 && $promote and @args = (":full");
    @h{@args} = (1) x @args;

    # What is left in %h after removing the two pragmas are script names.
    $^H{charnames_full} = delete $h{':full'};
    $^H{charnames_short} = delete $h{':short'};
    $^H{charnames_scripts} = [map uc, keys %h];

    ##
    ## If utf8? warnings are enabled, and some scripts were given,
    ## see if at least we can find one letter of each script.
    ##
    if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
        $txt = do "unicore/Name.pl" unless $txt;

        for my $script (@{$^H{charnames_scripts}}) {
            if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
                warnings::warn('utf8', "No such script: '$script'");
            }
        }
    }
} # import
240
# Names already looked up by viacode(), keyed by the canonical hex code
# (upper case, at least four digits, no surplus leading zeros).
my %viacode;

# Return the name listed in unicore/Name.pl for a code point.
#
# The single argument is either
#   - a decimal number: digits only, not starting with 0 (note that a
#     numeric literal such as 0x2722 arrives here as the decimal 10018), or
#   - a string of hex digits, optionally prefixed with "U+", "u+", "0x" or
#     "0X".
#
# Returns nothing (undef in scalar context) if the code has no entry in
# Name.pl.  Carps and returns nothing when called with the wrong number of
# arguments, with an argument in neither form, or with a code above
# U+10FFFF.
sub viacode
{
    if (@_ != 1) {
        carp "charnames::viacode() expects one argument";
        return;
    }

    my $arg = shift;

    # this comes actually from Unicode::UCD, where it is the named
    # function _getcode (), but it avoids the overhead of loading it
    my $hex;
    if ($arg =~ /^[1-9]\d*$/) {
        $hex = sprintf "%04X", $arg;
    }
    elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
        # Bring the digits into the form used in Name.pl ("0052", "00A1",
        # "10300"): upper case, no surplus leading zeros, at least four
        # digits.  Otherwise "0x263a", "e9" or "0" would never be found.
        ($hex = uc $1) =~ s/^0+(?=[0-9A-F]{4})//;
        $hex = ('0' x (4 - length($hex))) . $hex if length($hex) < 4;
    }
    else {
        carp("unexpected arg \"$arg\" to charnames::viacode()");
        return;
    }

    # checking the length first is slightly faster
    if (length($hex) > 5 && hex($hex) > 0x10FFFF) {
        # $hex is a string of hex digits; %X needs its numeric value.
        carp sprintf "Unicode characters only allocated up to U+10FFFF (you asked for U+%X)", hex $hex;
        return;
    }

    return $viacode{$hex} if exists $viacode{$hex};

    $txt = do "unicore/Name.pl" unless $txt;

    return unless $txt =~ m/^$hex\t\t(.+)/m;

    return $viacode{$hex} = $1;
} # viacode
278
# Code points already found by vianame(), keyed by character name.
my %vianame;

# Return the code point of the character with the given full name.
#
# Only exact names as listed in unicore/Name.pl are found.  Returns nothing
# if the name is not listed, and carps and returns an empty list unless
# called with exactly one argument.
#
# Note the odd one out: for an argument of the form "U+XXXX" the result is
# the character itself (chr), not its number.
sub vianame
{
    unless (@_ == 1) {
        carp "charnames::vianame() expects one name argument";
        return ();
    }

    my ($arg) = @_;

    if ($arg =~ /^U\+([0-9a-fA-F]+)$/) {
        return chr hex $1;
    }

    exists $vianame{$arg} and return $vianame{$arg};

    $txt ||= do "unicore/Name.pl";

    # Each line of $txt is "CODE\t\tNAME\n", so look for the name with its
    # delimiters on both sides.
    my $pos = index $txt, "\t\t$arg\n";
    return if $pos < $[;    # not found

    # Start of the line holding the match.  If the match is on the first
    # line, rindex() fails with $[ - 1 and the +1 yields the beginning of
    # $txt; otherwise the +1 steps over the "\n" of the previous line.
    my $line_start = rindex($txt, "\n", $pos) + 1;

    # Six characters from there may be "0000\t\t", "00A1\t\t", "10300\t",
    # "100000", etc.  Removing the TABs leaves just the code.
    my $code = substr $txt, $line_start, 6;
    $code =~ tr/\t//d;

    return $vianame{$arg} = hex $code;
} # vianame
313
314
315	1;
316	__END__
317
318	=head1 NAME
319
320	charnames - define character names for C<\N{named}> string literal escapes
321
322	=head1 SYNOPSIS
323
324	use charnames ':full';
325	print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
326
327	use charnames ':short';
328	print "\N{greek:Sigma} is an upper-case sigma.\n";
329
330	use charnames qw(cyrillic greek);
331	print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
332
333	use charnames ":full", ":alias" => {
334	e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
335	};
336	print "\N{e_ACUTE} is a small letter e with an acute.\n";
337
338	use charnames ();
339	print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
340	printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330"
341
342	=head1 DESCRIPTION
343
344	Pragma C<use charnames> supports arguments C<:full>, C<:short>, script
345	names and customized aliases. If C<:full> is present, for expansion of
346	C<\N{CHARNAME}>, the string C<CHARNAME> is first looked up in the list of
347	standard Unicode character names. If C<:short> is present, and
348	C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
349	as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
350	with script name arguments, then for C<\N{CHARNAME}> the name
351	C<CHARNAME> is looked up as a letter in the given scripts (in the
352	specified order). Customized aliases are explained in L</CUSTOM ALIASES>.
353
354	For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
355	this pragma looks for the names
356
357	SCRIPTNAME CAPITAL LETTER CHARNAME
358	SCRIPTNAME SMALL LETTER CHARNAME
359	SCRIPTNAME LETTER CHARNAME
360
361	in the table of standard Unicode names. If C<CHARNAME> is lowercase,
362	then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
363	is ignored.
364
365	Note that C<\N{...}> is compile-time, it's a special form of string
366	constant used inside double-quoted strings: in other words, you cannot
367	use variables inside the C<\N{...}>. If you want similar run-time
368	functionality, use charnames::vianame().
369
370	For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
371	as of Unicode 3.1, there are no official Unicode names but you can use
372	instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth). In
373	Unicode 3.2 (as of Perl 5.8) some naming changes take place; ISO 6429
374	has been updated, see L</ALIASES>. Also note that U+0080, U+0081,
375	U+0084, and U+0099 do not have names even in ISO 6429.
376
377	Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}"
378	is the Unicode smiley face, or "\N{WHITE SMILING FACE}".
379
380	=head1 CUSTOM TRANSLATORS
381
382	The mechanism of translation of C<\N{...}> escapes is general and not
383	hardwired into F<charnames.pm>. A module can install custom
384	translations (inside the scope which C<use>s the module) with the
385	following magic incantation:
386
387	use charnames (); # for $charnames::hint_bits
388	sub import {
389	shift;
390	$^H |= $charnames::hint_bits;
391	$^H{charnames} = \&translator;
392	}
393
394	Here translator() is a subroutine which takes C<CHARNAME> as an
395	argument, and returns text to insert into the string instead of the
396	C<\N{CHARNAME}> escape. Since the text to insert should be different
397	in C<bytes> mode and out of it, the function should check the current
398	state of C<bytes>-flag as in:
399
400	use bytes (); # for $bytes::hint_bits
401	sub translator {
402	if ($^H & $bytes::hint_bits) {
403	return bytes_translator(@_);
404	}
405	else {
406	return utf8_translator(@_);
407	}
408	}
409
410	=head1 CUSTOM ALIASES
411
412	This version of charnames supports three mechanisms of adding local
413	or customized aliases to standard Unicode naming conventions (:full).
414
415	=head2 Anonymous hashes
416
417	use charnames ":full", ":alias" => {
418	e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
419	};
420	my $str = "\N{e_ACUTE}";
421
422	=head2 Alias file
423
424	use charnames ":full", ":alias" => "pro";
425
426	will try to read "unicore/pro_alias.pl" from the @INC path. This
427	file should return a list in plain perl:
428
429	(
430	A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE",
431	A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
432	A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS",
433	A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE",
434	A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE",
435	A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE",
436	A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON",
437	);
438
439	=head2 Alias shortcut
440
441	use charnames ":alias" => ":pro";
442
443	works exactly the same as the alias pairs, only this time,
444	":full" is inserted automatically as first argument (if no
445	other argument is given).
446
447	=head1 charnames::viacode(code)
448
449	Returns the full name of the character indicated by the numeric code.
450	The example
451
452	print charnames::viacode(0x2722);
453
454	prints "FOUR TEARDROP-SPOKED ASTERISK".
455
456	Returns undef if no name is known for the code.
457
458	This works only for the standard names, and does not yet apply
459	to custom translators.
460
461	Notice that the name returned for U+FEFF is "ZERO WIDTH NO-BREAK
462	SPACE", not "BYTE ORDER MARK".
463
464	=head1 charnames::vianame(name)
465
466	Returns the code point indicated by the name.
467	The example
468
469	printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
470
471	prints "2722".
472
473	Returns undef if the name is unknown.
474
475	This works only for the standard names, and does not yet apply
476	to custom translators.
477
478	=head1 ALIASES
479
480	A few aliases have been defined for convenience: instead of having
481	to use the official names
482
483	LINE FEED (LF)
484	FORM FEED (FF)
485	CARRIAGE RETURN (CR)
486	NEXT LINE (NEL)
487
488	(yes, with parentheses) one can use
489
490	LINE FEED
491	FORM FEED
492	CARRIAGE RETURN
493	NEXT LINE
494	LF
495	FF
496	CR
497	NEL
498
499	One can also use
500
501	BYTE ORDER MARK
502	BOM
503
504	and
505
506	ZWNJ
507	ZWJ
508
509	for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER.
510
511	For backward compatibility one can use the old names for
512	certain C0 and C1 controls
513
514	old new
515
516	HORIZONTAL TABULATION CHARACTER TABULATION
517	VERTICAL TABULATION LINE TABULATION
518	FILE SEPARATOR INFORMATION SEPARATOR FOUR
519	GROUP SEPARATOR INFORMATION SEPARATOR THREE
520	RECORD SEPARATOR INFORMATION SEPARATOR TWO
521	UNIT SEPARATOR INFORMATION SEPARATOR ONE
522	PARTIAL LINE DOWN PARTIAL LINE FORWARD
523	PARTIAL LINE UP PARTIAL LINE BACKWARD
524
525	but the old names in addition to giving the character
526	will also give a warning about being deprecated.
527
528	=head1 ILLEGAL CHARACTERS
529
530	If you ask by name for a character that does not exist, a warning is
531	given and the Unicode I<replacement character> "\x{FFFD}" is returned.
532
533	If you ask by code for a character that does not exist, no warning is
534	given and C<undef> is returned. (Though if you ask for a code point
535	past U+10FFFF you do get a warning.)
536
537	=head1 BUGS
538
539	Since evaluation of the translation function happens in the middle of
540	compilation (of a string literal), the translation function should not
541	do any C<eval>s or C<require>s. This restriction should be lifted in
542	a future version of Perl.
543
544	=cut

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/dev-lang/perl/lib/charnames.pm@ 3298

Download in other formats: