| 1 | package charnames;
|
|---|
| 2 | use strict;
|
|---|
| 3 | use warnings;
|
|---|
| 4 | use Carp;
|
|---|
| 5 | use File::Spec;
|
|---|
| 6 | our $VERSION = '1.05';
|
|---|
| 7 |
|
|---|
| 8 | use bytes (); # for $bytes::hint_bits
|
|---|
| 9 | $charnames::hint_bits = 0x20000; # HINT_LOCALIZE_HH
|
|---|
| 10 |
|
|---|
| 11 | my %alias1 = (
|
|---|
| 12 | # Icky 3.2 names with parentheses.
|
|---|
| 13 | 'LINE FEED' => 'LINE FEED (LF)',
|
|---|
| 14 | 'FORM FEED' => 'FORM FEED (FF)',
|
|---|
| 15 | 'CARRIAGE RETURN' => 'CARRIAGE RETURN (CR)',
|
|---|
| 16 | 'NEXT LINE' => 'NEXT LINE (NEL)',
|
|---|
| 17 | # Convenience.
|
|---|
| 18 | 'LF' => 'LINE FEED (LF)',
|
|---|
| 19 | 'FF' => 'FORM FEED (FF)',
|
|---|
| 20 | 'CR' => 'CARRIAGE RETURN (CR)',
|
|---|
| 21 | 'NEL' => 'NEXT LINE (NEL)',
|
|---|
| 22 | # More convenience. For futher convencience,
|
|---|
| 23 | # it is suggested some way using using the NamesList
|
|---|
| 24 | # aliases is implemented.
|
|---|
| 25 | 'ZWNJ' => 'ZERO WIDTH NON-JOINER',
|
|---|
| 26 | 'ZWJ' => 'ZERO WIDTH JOINER',
|
|---|
| 27 | 'BOM' => 'BYTE ORDER MARK',
|
|---|
| 28 | );
|
|---|
| 29 |
|
|---|
| 30 | my %alias2 = (
|
|---|
| 31 | # Pre-3.2 compatibility (only for the first 256 characters).
|
|---|
| 32 | 'HORIZONTAL TABULATION' => 'CHARACTER TABULATION',
|
|---|
| 33 | 'VERTICAL TABULATION' => 'LINE TABULATION',
|
|---|
| 34 | 'FILE SEPARATOR' => 'INFORMATION SEPARATOR FOUR',
|
|---|
| 35 | 'GROUP SEPARATOR' => 'INFORMATION SEPARATOR THREE',
|
|---|
| 36 | 'RECORD SEPARATOR' => 'INFORMATION SEPARATOR TWO',
|
|---|
| 37 | 'UNIT SEPARATOR' => 'INFORMATION SEPARATOR ONE',
|
|---|
| 38 | 'PARTIAL LINE DOWN' => 'PARTIAL LINE FORWARD',
|
|---|
| 39 | 'PARTIAL LINE UP' => 'PARTIAL LINE BACKWARD',
|
|---|
| 40 | );
|
|---|
| 41 |
|
|---|
| 42 | my %alias3 = (
|
|---|
| 43 | # User defined aliasses. Even more convenient :)
|
|---|
| 44 | );
|
|---|
| 45 | my $txt;
|
|---|
| 46 |
|
|---|
| 47 | sub alias (@)
|
|---|
| 48 | {
|
|---|
| 49 | @_ or return %alias3;
|
|---|
| 50 | my $alias = ref $_[0] ? $_[0] : { @_ };
|
|---|
| 51 | @alias3{keys %$alias} = values %$alias;
|
|---|
| 52 | } # alias
|
|---|
| 53 |
|
|---|
| 54 | sub alias_file ($)
|
|---|
| 55 | {
|
|---|
| 56 | my ($arg, $file) = @_;
|
|---|
| 57 | if (-f $arg && File::Spec->file_name_is_absolute ($arg)) {
|
|---|
| 58 | $file = $arg;
|
|---|
| 59 | }
|
|---|
| 60 | elsif ($arg =~ m/^\w+$/) {
|
|---|
| 61 | $file = "unicore/${arg}_alias.pl";
|
|---|
| 62 | }
|
|---|
| 63 | else {
|
|---|
| 64 | croak "Charnames alias files can only have identifier characters";
|
|---|
| 65 | }
|
|---|
| 66 | if (my @alias = do $file) {
|
|---|
| 67 | @alias == 1 && !defined $alias[0] and
|
|---|
| 68 | croak "$file cannot be used as alias file for charnames";
|
|---|
| 69 | @alias % 2 and
|
|---|
| 70 | croak "$file did not return a (valid) list of alias pairs";
|
|---|
| 71 | alias (@alias);
|
|---|
| 72 | return (1);
|
|---|
| 73 | }
|
|---|
| 74 | 0;
|
|---|
| 75 | } # alias_file
|
|---|
| 76 |
|
|---|
| 77 | # This is not optimized in any way yet
|
|---|
| 78 | sub charnames
|
|---|
| 79 | {
|
|---|
| 80 | my $name = shift;
|
|---|
| 81 |
|
|---|
| 82 | if (exists $alias1{$name}) {
|
|---|
| 83 | $name = $alias1{$name};
|
|---|
| 84 | }
|
|---|
| 85 | elsif (exists $alias2{$name}) {
|
|---|
| 86 | require warnings;
|
|---|
| 87 | warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead});
|
|---|
| 88 | $name = $alias2{$name};
|
|---|
| 89 | }
|
|---|
| 90 | elsif (exists $alias3{$name}) {
|
|---|
| 91 | $name = $alias3{$name};
|
|---|
| 92 | }
|
|---|
| 93 |
|
|---|
| 94 | my $ord;
|
|---|
| 95 | my @off;
|
|---|
| 96 | my $fname;
|
|---|
| 97 |
|
|---|
| 98 | if ($name eq "BYTE ORDER MARK") {
|
|---|
| 99 | $fname = $name;
|
|---|
| 100 | $ord = 0xFEFF;
|
|---|
| 101 | } else {
|
|---|
| 102 | ## Suck in the code/name list as a big string.
|
|---|
| 103 | ## Lines look like:
|
|---|
| 104 | ## "0052\t\tLATIN CAPITAL LETTER R\n"
|
|---|
| 105 | $txt = do "unicore/Name.pl" unless $txt;
|
|---|
| 106 |
|
|---|
| 107 | ## @off will hold the index into the code/name string of the start and
|
|---|
| 108 | ## end of the name as we find it.
|
|---|
| 109 |
|
|---|
| 110 | ## If :full, look for the name exactly
|
|---|
| 111 | if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) {
|
|---|
| 112 | @off = ($-[0], $+[0]);
|
|---|
| 113 | }
|
|---|
| 114 |
|
|---|
| 115 | ## If we didn't get above, and :short allowed, look for the short name.
|
|---|
| 116 | ## The short name is like "greek:Sigma"
|
|---|
| 117 | unless (@off) {
|
|---|
| 118 | if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
|
|---|
| 119 | my ($script, $cname) = ($1, $2);
|
|---|
| 120 | my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
|
|---|
| 121 | if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) {
|
|---|
| 122 | @off = ($-[0], $+[0]);
|
|---|
| 123 | }
|
|---|
| 124 | }
|
|---|
| 125 | }
|
|---|
| 126 |
|
|---|
| 127 | ## If we still don't have it, check for the name among the loaded
|
|---|
| 128 | ## scripts.
|
|---|
| 129 | if (not @off) {
|
|---|
| 130 | my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
|
|---|
| 131 | for my $script (@{$^H{charnames_scripts}}) {
|
|---|
| 132 | if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) {
|
|---|
| 133 | @off = ($-[0], $+[0]);
|
|---|
| 134 | last;
|
|---|
| 135 | }
|
|---|
| 136 | }
|
|---|
| 137 | }
|
|---|
| 138 |
|
|---|
| 139 | ## If we don't have it by now, give up.
|
|---|
| 140 | unless (@off) {
|
|---|
| 141 | carp "Unknown charname '$name'";
|
|---|
| 142 | return "\x{FFFD}";
|
|---|
| 143 | }
|
|---|
| 144 |
|
|---|
| 145 | ##
|
|---|
| 146 | ## Now know where in the string the name starts.
|
|---|
| 147 | ## The code, in hex, is before that.
|
|---|
| 148 | ##
|
|---|
| 149 | ## The code can be 4-6 characters long, so we've got to sort of
|
|---|
| 150 | ## go look for it, just after the newline that comes before $off[0].
|
|---|
| 151 | ##
|
|---|
| 152 | ## This would be much easier if unicore/Name.pl had info in
|
|---|
| 153 | ## a name/code order, instead of code/name order.
|
|---|
| 154 | ##
|
|---|
| 155 | ## The +1 after the rindex() is to skip past the newline we're finding,
|
|---|
| 156 | ## or, if the rindex() fails, to put us to an offset of zero.
|
|---|
| 157 | ##
|
|---|
| 158 | my $hexstart = rindex($txt, "\n", $off[0]) + 1;
|
|---|
| 159 |
|
|---|
| 160 | ## we know where it starts, so turn into number -
|
|---|
| 161 | ## the ordinal for the char.
|
|---|
| 162 | $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
|
|---|
| 163 | }
|
|---|
| 164 |
|
|---|
| 165 | if ($^H & $bytes::hint_bits) { # "use bytes" in effect?
|
|---|
| 166 | use bytes;
|
|---|
| 167 | return chr $ord if $ord <= 255;
|
|---|
| 168 | my $hex = sprintf "%04x", $ord;
|
|---|
| 169 | if (not defined $fname) {
|
|---|
| 170 | $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
|
|---|
| 171 | }
|
|---|
| 172 | croak "Character 0x$hex with name '$fname' is above 0xFF";
|
|---|
| 173 | }
|
|---|
| 174 |
|
|---|
| 175 | no warnings 'utf8'; # allow even illegal characters
|
|---|
| 176 | return pack "U", $ord;
|
|---|
| 177 | } # charnames
|
|---|
| 178 |
|
|---|
| 179 | sub import
|
|---|
| 180 | {
|
|---|
| 181 | shift; ## ignore class name
|
|---|
| 182 |
|
|---|
| 183 | if (not @_) {
|
|---|
| 184 | carp("`use charnames' needs explicit imports list");
|
|---|
| 185 | }
|
|---|
| 186 | $^H |= $charnames::hint_bits;
|
|---|
| 187 | $^H{charnames} = \&charnames ;
|
|---|
| 188 |
|
|---|
| 189 | ##
|
|---|
| 190 | ## fill %h keys with our @_ args.
|
|---|
| 191 | ##
|
|---|
| 192 | my ($promote, %h, @args) = (0);
|
|---|
| 193 | while (my $arg = shift) {
|
|---|
| 194 | if ($arg eq ":alias") {
|
|---|
| 195 | @_ or
|
|---|
| 196 | croak ":alias needs an argument in charnames";
|
|---|
| 197 | my $alias = shift;
|
|---|
| 198 | if (ref $alias) {
|
|---|
| 199 | ref $alias eq "HASH" or
|
|---|
| 200 | croak "Only HASH reference supported as argument to :alias";
|
|---|
| 201 | alias ($alias);
|
|---|
| 202 | next;
|
|---|
| 203 | }
|
|---|
| 204 | if ($alias =~ m{:(\w+)$}) {
|
|---|
| 205 | $1 eq "full" || $1 eq "short" and
|
|---|
| 206 | croak ":alias cannot use existing pragma :$1 (reversed order?)";
|
|---|
| 207 | alias_file ($1) and $promote = 1;
|
|---|
| 208 | next;
|
|---|
| 209 | }
|
|---|
| 210 | alias_file ($alias);
|
|---|
| 211 | next;
|
|---|
| 212 | }
|
|---|
| 213 | if (substr($arg, 0, 1) eq ':' and ! ($arg eq ":full" || $arg eq ":short")) {
|
|---|
| 214 | warn "unsupported special '$arg' in charnames";
|
|---|
| 215 | next;
|
|---|
| 216 | }
|
|---|
| 217 | push @args, $arg;
|
|---|
| 218 | }
|
|---|
| 219 | @args == 0 && $promote and @args = (":full");
|
|---|
| 220 | @h{@args} = (1) x @args;
|
|---|
| 221 |
|
|---|
| 222 | $^H{charnames_full} = delete $h{':full'};
|
|---|
| 223 | $^H{charnames_short} = delete $h{':short'};
|
|---|
| 224 | $^H{charnames_scripts} = [map uc, keys %h];
|
|---|
| 225 |
|
|---|
| 226 | ##
|
|---|
| 227 | ## If utf8? warnings are enabled, and some scripts were given,
|
|---|
| 228 | ## see if at least we can find one letter of each script.
|
|---|
| 229 | ##
|
|---|
| 230 | if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
|
|---|
| 231 | $txt = do "unicore/Name.pl" unless $txt;
|
|---|
| 232 |
|
|---|
| 233 | for my $script (@{$^H{charnames_scripts}}) {
|
|---|
| 234 | if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
|
|---|
| 235 | warnings::warn('utf8', "No such script: '$script'");
|
|---|
| 236 | }
|
|---|
| 237 | }
|
|---|
| 238 | }
|
|---|
| 239 | } # import
|
|---|
| 240 |
|
|---|
| 241 | my %viacode;
|
|---|
| 242 |
|
|---|
| 243 | sub viacode
|
|---|
| 244 | {
|
|---|
| 245 | if (@_ != 1) {
|
|---|
| 246 | carp "charnames::viacode() expects one argument";
|
|---|
| 247 | return;
|
|---|
| 248 | }
|
|---|
| 249 |
|
|---|
| 250 | my $arg = shift;
|
|---|
| 251 |
|
|---|
| 252 | # this comes actually from Unicode::UCD, where it is the named
|
|---|
| 253 | # function _getcode (), but it avoids the overhead of loading it
|
|---|
| 254 | my $hex;
|
|---|
| 255 | if ($arg =~ /^[1-9]\d*$/) {
|
|---|
| 256 | $hex = sprintf "%04X", $arg;
|
|---|
| 257 | } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
|
|---|
| 258 | $hex = $1;
|
|---|
| 259 | } else {
|
|---|
| 260 | carp("unexpected arg \"$arg\" to charnames::viacode()");
|
|---|
| 261 | return;
|
|---|
| 262 | }
|
|---|
| 263 |
|
|---|
| 264 | # checking the length first is slightly faster
|
|---|
| 265 | if (length($hex) > 5 && hex($hex) > 0x10FFFF) {
|
|---|
| 266 | carp sprintf "Unicode characters only allocated up to U+10FFFF (you asked for U+%X)", $hex;
|
|---|
| 267 | return;
|
|---|
| 268 | }
|
|---|
| 269 |
|
|---|
| 270 | return $viacode{$hex} if exists $viacode{$hex};
|
|---|
| 271 |
|
|---|
| 272 | $txt = do "unicore/Name.pl" unless $txt;
|
|---|
| 273 |
|
|---|
| 274 | return unless $txt =~ m/^$hex\t\t(.+)/m;
|
|---|
| 275 |
|
|---|
| 276 | $viacode{$hex} = $1;
|
|---|
| 277 | } # viacode
|
|---|
| 278 |
|
|---|
| 279 | my %vianame;
|
|---|
| 280 |
|
|---|
| 281 | sub vianame
|
|---|
| 282 | {
|
|---|
| 283 | if (@_ != 1) {
|
|---|
| 284 | carp "charnames::vianame() expects one name argument";
|
|---|
| 285 | return ()
|
|---|
| 286 | }
|
|---|
| 287 |
|
|---|
| 288 | my $arg = shift;
|
|---|
| 289 |
|
|---|
| 290 | return chr hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/;
|
|---|
| 291 |
|
|---|
| 292 | return $vianame{$arg} if exists $vianame{$arg};
|
|---|
| 293 |
|
|---|
| 294 | $txt = do "unicore/Name.pl" unless $txt;
|
|---|
| 295 |
|
|---|
| 296 | my $pos = index $txt, "\t\t$arg\n";
|
|---|
| 297 | if ($[ <= $pos) {
|
|---|
| 298 | my $posLF = rindex $txt, "\n", $pos;
|
|---|
| 299 | (my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d;
|
|---|
| 300 | return $vianame{$arg} = hex $code;
|
|---|
| 301 |
|
|---|
| 302 | # If $pos is at the 1st line, $posLF must be $[ - 1 (not found);
|
|---|
| 303 | # then $posLF + 1 equals to $[ (at the beginning of $txt).
|
|---|
| 304 | # Otherwise $posLF is the position of "\n";
|
|---|
| 305 | # then $posLF + 1 must be the position of the next to "\n"
|
|---|
| 306 | # (the beginning of the line).
|
|---|
| 307 | # substr($txt, $posLF + 1, 6) may be "0000\t\t", "00A1\t\t",
|
|---|
| 308 | # "10300\t", "100000", etc. So we can get the code via removing TAB.
|
|---|
| 309 | } else {
|
|---|
| 310 | return;
|
|---|
| 311 | }
|
|---|
| 312 | } # vianame
|
|---|
| 313 |
|
|---|
| 314 |
|
|---|
| 315 | 1;
|
|---|
| 316 | __END__
|
|---|
| 317 |
|
|---|
| 318 | =head1 NAME
|
|---|
| 319 |
|
|---|
| 320 | charnames - define character names for C<\N{named}> string literal escapes
|
|---|
| 321 |
|
|---|
| 322 | =head1 SYNOPSIS
|
|---|
| 323 |
|
|---|
| 324 | use charnames ':full';
|
|---|
| 325 | print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
|
|---|
| 326 |
|
|---|
| 327 | use charnames ':short';
|
|---|
| 328 | print "\N{greek:Sigma} is an upper-case sigma.\n";
|
|---|
| 329 |
|
|---|
| 330 | use charnames qw(cyrillic greek);
|
|---|
| 331 | print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
|
|---|
| 332 |
|
|---|
| 333 | use charnames ":full", ":alias" => {
|
|---|
| 334 | e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
|
|---|
| 335 | };
|
|---|
| 336 | print "\N{e_ACUTE} is a small letter e with an acute.\n";
|
|---|
| 337 |
|
|---|
| 338 | use charnames ();
|
|---|
| 339 | print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
|
|---|
| 340 | printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330"
|
|---|
| 341 |
|
|---|
| 342 | =head1 DESCRIPTION
|
|---|
| 343 |
|
|---|
| 344 | Pragma C<use charnames> supports arguments C<:full>, C<:short>, script
|
|---|
| 345 | names and customized aliases. If C<:full> is present, for expansion of
|
|---|
| 346 | C<\N{CHARNAME}>, the string C<CHARNAME> is first looked up in the list of
|
|---|
| 347 | standard Unicode character names. If C<:short> is present, and
|
|---|
| 348 | C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
|
|---|
| 349 | as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
|
|---|
| 350 | with script name arguments, then for C<\N{CHARNAME}> the name
|
|---|
| 351 | C<CHARNAME> is looked up as a letter in the given scripts (in the
|
|---|
| 352 | specified order). Customized aliases are explained in L</CUSTOM ALIASES>.
|
|---|
| 353 |
|
|---|
| 354 | For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
|
|---|
| 355 | this pragma looks for the names
|
|---|
| 356 |
|
|---|
| 357 | SCRIPTNAME CAPITAL LETTER CHARNAME
|
|---|
| 358 | SCRIPTNAME SMALL LETTER CHARNAME
|
|---|
| 359 | SCRIPTNAME LETTER CHARNAME
|
|---|
| 360 |
|
|---|
| 361 | in the table of standard Unicode names. If C<CHARNAME> is lowercase,
|
|---|
| 362 | then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
|
|---|
| 363 | is ignored.
|
|---|
| 364 |
|
|---|
| 365 | Note that C<\N{...}> is compile-time, it's a special form of string
|
|---|
| 366 | constant used inside double-quoted strings: in other words, you cannot
|
|---|
| 367 | use variables inside the C<\N{...}>. If you want similar run-time
|
|---|
| 368 | functionality, use charnames::vianame().
|
|---|
| 369 |
|
|---|
| 370 | For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
|
|---|
| 371 | as of Unicode 3.1, there are no official Unicode names but you can use
|
|---|
| 372 | instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth). In
|
|---|
| 373 | Unicode 3.2 (as of Perl 5.8) some naming changes take place ISO 6429
|
|---|
| 374 | has been updated, see L</ALIASES>. Also note that the U+UU80, U+0081,
|
|---|
| 375 | U+0084, and U+0099 do not have names even in ISO 6429.
|
|---|
| 376 |
|
|---|
| 377 | Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}"
|
|---|
| 378 | is the Unicode smiley face, or "\N{WHITE SMILING FACE}".
|
|---|
| 379 |
|
|---|
| 380 | =head1 CUSTOM TRANSLATORS
|
|---|
| 381 |
|
|---|
| 382 | The mechanism of translation of C<\N{...}> escapes is general and not
|
|---|
| 383 | hardwired into F<charnames.pm>. A module can install custom
|
|---|
| 384 | translations (inside the scope which C<use>s the module) with the
|
|---|
| 385 | following magic incantation:
|
|---|
| 386 |
|
|---|
| 387 | use charnames (); # for $charnames::hint_bits
|
|---|
| 388 | sub import {
|
|---|
| 389 | shift;
|
|---|
| 390 | $^H |= $charnames::hint_bits;
|
|---|
| 391 | $^H{charnames} = \&translator;
|
|---|
| 392 | }
|
|---|
| 393 |
|
|---|
| 394 | Here translator() is a subroutine which takes C<CHARNAME> as an
|
|---|
| 395 | argument, and returns text to insert into the string instead of the
|
|---|
| 396 | C<\N{CHARNAME}> escape. Since the text to insert should be different
|
|---|
| 397 | in C<bytes> mode and out of it, the function should check the current
|
|---|
| 398 | state of C<bytes>-flag as in:
|
|---|
| 399 |
|
|---|
| 400 | use bytes (); # for $bytes::hint_bits
|
|---|
| 401 | sub translator {
|
|---|
| 402 | if ($^H & $bytes::hint_bits) {
|
|---|
| 403 | return bytes_translator(@_);
|
|---|
| 404 | }
|
|---|
| 405 | else {
|
|---|
| 406 | return utf8_translator(@_);
|
|---|
| 407 | }
|
|---|
| 408 | }
|
|---|
| 409 |
|
|---|
| 410 | =head1 CUSTOM ALIASES
|
|---|
| 411 |
|
|---|
| 412 | This version of charnames supports three mechanisms of adding local
|
|---|
| 413 | or customized aliases to standard Unicode naming conventions (:full)
|
|---|
| 414 |
|
|---|
| 415 | =head2 Anonymous hashes
|
|---|
| 416 |
|
|---|
| 417 | use charnames ":full", ":alias" => {
|
|---|
| 418 | e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
|
|---|
| 419 | };
|
|---|
| 420 | my $str = "\N{e_ACUTE}";
|
|---|
| 421 |
|
|---|
| 422 | =head2 Alias file
|
|---|
| 423 |
|
|---|
| 424 | use charnames ":full", ":alias" => "pro";
|
|---|
| 425 |
|
|---|
| 426 | will try to read "unicore/pro_alias.pl" from the @INC path. This
|
|---|
| 427 | file should return a list in plain perl:
|
|---|
| 428 |
|
|---|
| 429 | (
|
|---|
| 430 | A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE",
|
|---|
| 431 | A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
|
|---|
| 432 | A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS",
|
|---|
| 433 | A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE",
|
|---|
| 434 | A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE",
|
|---|
| 435 | A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE",
|
|---|
| 436 | A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON",
|
|---|
| 437 | );
|
|---|
| 438 |
|
|---|
| 439 | =head2 Alias shortcut
|
|---|
| 440 |
|
|---|
| 441 | use charnames ":alias" => ":pro";
|
|---|
| 442 |
|
|---|
| 443 | works exactly the same as the alias pairs, only this time,
|
|---|
| 444 | ":full" is inserted automatically as first argument (if no
|
|---|
| 445 | other argument is given).
|
|---|
| 446 |
|
|---|
| 447 | =head1 charnames::viacode(code)
|
|---|
| 448 |
|
|---|
| 449 | Returns the full name of the character indicated by the numeric code.
|
|---|
| 450 | The example
|
|---|
| 451 |
|
|---|
| 452 | print charnames::viacode(0x2722);
|
|---|
| 453 |
|
|---|
| 454 | prints "FOUR TEARDROP-SPOKED ASTERISK".
|
|---|
| 455 |
|
|---|
| 456 | Returns undef if no name is known for the code.
|
|---|
| 457 |
|
|---|
| 458 | This works only for the standard names, and does not yet apply
|
|---|
| 459 | to custom translators.
|
|---|
| 460 |
|
|---|
| 461 | Notice that the name returned for of U+FEFF is "ZERO WIDTH NO-BREAK
|
|---|
| 462 | SPACE", not "BYTE ORDER MARK".
|
|---|
| 463 |
|
|---|
| 464 | =head1 charnames::vianame(name)
|
|---|
| 465 |
|
|---|
| 466 | Returns the code point indicated by the name.
|
|---|
| 467 | The example
|
|---|
| 468 |
|
|---|
| 469 | printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
|
|---|
| 470 |
|
|---|
| 471 | prints "2722".
|
|---|
| 472 |
|
|---|
| 473 | Returns undef if the name is unknown.
|
|---|
| 474 |
|
|---|
| 475 | This works only for the standard names, and does not yet apply
|
|---|
| 476 | to custom translators.
|
|---|
| 477 |
|
|---|
| 478 | =head1 ALIASES
|
|---|
| 479 |
|
|---|
| 480 | A few aliases have been defined for convenience: instead of having
|
|---|
| 481 | to use the official names
|
|---|
| 482 |
|
|---|
| 483 | LINE FEED (LF)
|
|---|
| 484 | FORM FEED (FF)
|
|---|
| 485 | CARRIAGE RETURN (CR)
|
|---|
| 486 | NEXT LINE (NEL)
|
|---|
| 487 |
|
|---|
| 488 | (yes, with parentheses) one can use
|
|---|
| 489 |
|
|---|
| 490 | LINE FEED
|
|---|
| 491 | FORM FEED
|
|---|
| 492 | CARRIAGE RETURN
|
|---|
| 493 | NEXT LINE
|
|---|
| 494 | LF
|
|---|
| 495 | FF
|
|---|
| 496 | CR
|
|---|
| 497 | NEL
|
|---|
| 498 |
|
|---|
| 499 | One can also use
|
|---|
| 500 |
|
|---|
| 501 | BYTE ORDER MARK
|
|---|
| 502 | BOM
|
|---|
| 503 |
|
|---|
| 504 | and
|
|---|
| 505 |
|
|---|
| 506 | ZWNJ
|
|---|
| 507 | ZWJ
|
|---|
| 508 |
|
|---|
| 509 | for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER.
|
|---|
| 510 |
|
|---|
| 511 | For backward compatibility one can use the old names for
|
|---|
| 512 | certain C0 and C1 controls
|
|---|
| 513 |
|
|---|
| 514 | old new
|
|---|
| 515 |
|
|---|
| 516 | HORIZONTAL TABULATION CHARACTER TABULATION
|
|---|
| 517 | VERTICAL TABULATION LINE TABULATION
|
|---|
| 518 | FILE SEPARATOR INFORMATION SEPARATOR FOUR
|
|---|
| 519 | GROUP SEPARATOR INFORMATION SEPARATOR THREE
|
|---|
| 520 | RECORD SEPARATOR INFORMATION SEPARATOR TWO
|
|---|
| 521 | UNIT SEPARATOR INFORMATION SEPARATOR ONE
|
|---|
| 522 | PARTIAL LINE DOWN PARTIAL LINE FORWARD
|
|---|
| 523 | PARTIAL LINE UP PARTIAL LINE BACKWARD
|
|---|
| 524 |
|
|---|
| 525 | but the old names in addition to giving the character
|
|---|
| 526 | will also give a warning about being deprecated.
|
|---|
| 527 |
|
|---|
| 528 | =head1 ILLEGAL CHARACTERS
|
|---|
| 529 |
|
|---|
| 530 | If you ask by name for a character that does not exist, a warning is
|
|---|
| 531 | given and the Unicode I<replacement character> "\x{FFFD}" is returned.
|
|---|
| 532 |
|
|---|
| 533 | If you ask by code for a character that does not exist, no warning is
|
|---|
| 534 | given and C<undef> is returned. (Though if you ask for a code point
|
|---|
| 535 | past U+10FFFF you do get a warning.)
|
|---|
| 536 |
|
|---|
| 537 | =head1 BUGS
|
|---|
| 538 |
|
|---|
| 539 | Since evaluation of the translation function happens in a middle of
|
|---|
| 540 | compilation (of a string literal), the translation function should not
|
|---|
| 541 | do any C<eval>s or C<require>s. This restriction should be lifted in
|
|---|
| 542 | a future version of Perl.
|
|---|
| 543 |
|
|---|
| 544 | =cut
|
|---|