Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

charnames.pm@ 3368

Visit:

Last change on this file since 3368 was 3181, checked in by bird, 19 years ago
perl 5.8.8
File size: 15.1 KB

Line
1	package charnames;
2	use strict;
3	use warnings;
4	use Carp;
5	use File::Spec;
6	our $VERSION = '1.05';
7
8	use bytes (); # for $bytes::hint_bits
9	$charnames::hint_bits = 0x20000; # HINT_LOCALIZE_HH
10
11	my %alias1 = (
12	# Icky 3.2 names with parentheses.
13	'LINE FEED' => 'LINE FEED (LF)',
14	'FORM FEED' => 'FORM FEED (FF)',
15	'CARRIAGE RETURN' => 'CARRIAGE RETURN (CR)',
16	'NEXT LINE' => 'NEXT LINE (NEL)',
17	# Convenience.
18	'LF' => 'LINE FEED (LF)',
19	'FF' => 'FORM FEED (FF)',
20	'CR' => 'CARRIAGE RETURN (CR)',
21	'NEL' => 'NEXT LINE (NEL)',
22	# More convenience. For futher convencience,
23	# it is suggested some way using using the NamesList
24	# aliases is implemented.
25	'ZWNJ' => 'ZERO WIDTH NON-JOINER',
26	'ZWJ' => 'ZERO WIDTH JOINER',
27	'BOM' => 'BYTE ORDER MARK',
28	);
29
30	my %alias2 = (
31	# Pre-3.2 compatibility (only for the first 256 characters).
32	'HORIZONTAL TABULATION' => 'CHARACTER TABULATION',
33	'VERTICAL TABULATION' => 'LINE TABULATION',
34	'FILE SEPARATOR' => 'INFORMATION SEPARATOR FOUR',
35	'GROUP SEPARATOR' => 'INFORMATION SEPARATOR THREE',
36	'RECORD SEPARATOR' => 'INFORMATION SEPARATOR TWO',
37	'UNIT SEPARATOR' => 'INFORMATION SEPARATOR ONE',
38	'PARTIAL LINE DOWN' => 'PARTIAL LINE FORWARD',
39	'PARTIAL LINE UP' => 'PARTIAL LINE BACKWARD',
40	);
41
42	my %alias3 = (
43	# User defined aliasses. Even more convenient :)
44	);
45	my $txt;
46
47	sub alias (@)
48	{
49	@_ or return %alias3;
50	my $alias = ref $_[0] ? $_[0] : { @_ };
51	@alias3{keys %$alias} = values %$alias;
52	} # alias
53
54	sub alias_file ($)
55	{
56	my ($arg, $file) = @_;
57	if (-f $arg && File::Spec->file_name_is_absolute ($arg)) {
58	$file = $arg;
59	}
60	elsif ($arg =~ m/^\w+$/) {
61	$file = "unicore/${arg}_alias.pl";
62	}
63	else {
64	croak "Charnames alias files can only have identifier characters";
65	}
66	if (my @alias = do $file) {
67	@alias == 1 && !defined $alias[0] and
68	croak "$file cannot be used as alias file for charnames";
69	@alias % 2 and
70	croak "$file did not return a (valid) list of alias pairs";
71	alias (@alias);
72	return (1);
73	}
74	0;
75	} # alias_file
76
77	# This is not optimized in any way yet
78	sub charnames
79	{
80	my $name = shift;
81
82	if (exists $alias1{$name}) {
83	$name = $alias1{$name};
84	}
85	elsif (exists $alias2{$name}) {
86	require warnings;
87	warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead});
88	$name = $alias2{$name};
89	}
90	elsif (exists $alias3{$name}) {
91	$name = $alias3{$name};
92	}
93
94	my $ord;
95	my @off;
96	my $fname;
97
98	if ($name eq "BYTE ORDER MARK") {
99	$fname = $name;
100	$ord = 0xFEFF;
101	} else {
102	## Suck in the code/name list as a big string.
103	## Lines look like:
104	## "0052\t\tLATIN CAPITAL LETTER R\n"
105	$txt = do "unicore/Name.pl" unless $txt;
106
107	## @off will hold the index into the code/name string of the start and
108	## end of the name as we find it.
109
110	## If :full, look for the name exactly
111	if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) {
112	@off = ($-[0], $+[0]);
113	}
114
115	## If we didn't get above, and :short allowed, look for the short name.
116	## The short name is like "greek:Sigma"
117	unless (@off) {
118	if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
119	my ($script, $cname) = ($1, $2);
120	my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
121	if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) {
122	@off = ($-[0], $+[0]);
123	}
124	}
125	}
126
127	## If we still don't have it, check for the name among the loaded
128	## scripts.
129	if (not @off) {
130	my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
131	for my $script (@{$^H{charnames_scripts}}) {
132	if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) {
133	@off = ($-[0], $+[0]);
134	last;
135	}
136	}
137	}
138
139	## If we don't have it by now, give up.
140	unless (@off) {
141	carp "Unknown charname '$name'";
142	return "\x{FFFD}";
143	}
144
145	##
146	## Now know where in the string the name starts.
147	## The code, in hex, is before that.
148	##
149	## The code can be 4-6 characters long, so we've got to sort of
150	## go look for it, just after the newline that comes before $off[0].
151	##
152	## This would be much easier if unicore/Name.pl had info in
153	## a name/code order, instead of code/name order.
154	##
155	## The +1 after the rindex() is to skip past the newline we're finding,
156	## or, if the rindex() fails, to put us to an offset of zero.
157	##
158	my $hexstart = rindex($txt, "\n", $off[0]) + 1;
159
160	## we know where it starts, so turn into number -
161	## the ordinal for the char.
162	$ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
163	}
164
165	if ($^H & $bytes::hint_bits) { # "use bytes" in effect?
166	use bytes;
167	return chr $ord if $ord <= 255;
168	my $hex = sprintf "%04x", $ord;
169	if (not defined $fname) {
170	$fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
171	}
172	croak "Character 0x$hex with name '$fname' is above 0xFF";
173	}
174
175	no warnings 'utf8'; # allow even illegal characters
176	return pack "U", $ord;
177	} # charnames
178
179	sub import
180	{
181	shift; ## ignore class name
182
183	if (not @_) {
184	carp("`use charnames' needs explicit imports list");
185	}
186	$^H \|= $charnames::hint_bits;
187	$^H{charnames} = \&charnames ;
188
189	##