| 1 | package charnames;
|
|---|
| 2 | use strict;
|
|---|
| 3 | use warnings;
|
|---|
| 4 | use Carp;
|
|---|
| 5 | use File::Spec;
|
|---|
| 6 | our $VERSION = '1.05';
|
|---|
| 7 |
|
|---|
| 8 | use bytes (); # for $bytes::hint_bits
|
|---|
| 9 | $charnames::hint_bits = 0x20000; # HINT_LOCALIZE_HH
|
|---|
| 10 |
|
|---|
| 11 | my %alias1 = (
|
|---|
| 12 | # Icky 3.2 names with parentheses.
|
|---|
| 13 | 'LINE FEED' => 'LINE FEED (LF)',
|
|---|
| 14 | 'FORM FEED' => 'FORM FEED (FF)',
|
|---|
| 15 | 'CARRIAGE RETURN' => 'CARRIAGE RETURN (CR)',
|
|---|
| 16 | 'NEXT LINE' => 'NEXT LINE (NEL)',
|
|---|
| 17 | # Convenience.
|
|---|
| 18 | 'LF' => 'LINE FEED (LF)',
|
|---|
| 19 | 'FF' => 'FORM FEED (FF)',
|
|---|
| 20 | 'CR' => 'CARRIAGE RETURN (CR)',
|
|---|
| 21 | 'NEL' => 'NEXT LINE (NEL)',
|
|---|
| 22 | # More convenience. For futher convencience,
|
|---|
| 23 | # it is suggested some way using using the NamesList
|
|---|
| 24 | # aliases is implemented.
|
|---|
| 25 | 'ZWNJ' => 'ZERO WIDTH NON-JOINER',
|
|---|
| 26 | 'ZWJ' => 'ZERO WIDTH JOINER',
|
|---|
| 27 | 'BOM' => 'BYTE ORDER MARK',
|
|---|
| 28 | );
|
|---|
| 29 |
|
|---|
| 30 | my %alias2 = (
|
|---|
| 31 | # Pre-3.2 compatibility (only for the first 256 characters).
|
|---|
| 32 | 'HORIZONTAL TABULATION' => 'CHARACTER TABULATION',
|
|---|
| 33 | 'VERTICAL TABULATION' => 'LINE TABULATION',
|
|---|
| 34 | 'FILE SEPARATOR' => 'INFORMATION SEPARATOR FOUR',
|
|---|
| 35 | 'GROUP SEPARATOR' => 'INFORMATION SEPARATOR THREE',
|
|---|
| 36 | 'RECORD SEPARATOR' => 'INFORMATION SEPARATOR TWO',
|
|---|
| 37 | 'UNIT SEPARATOR' => 'INFORMATION SEPARATOR ONE',
|
|---|
| 38 | 'PARTIAL LINE DOWN' => 'PARTIAL LINE FORWARD',
|
|---|
| 39 | 'PARTIAL LINE UP' => 'PARTIAL LINE BACKWARD',
|
|---|
| 40 | );
|
|---|
| 41 |
|
|---|
| 42 | my %alias3 = (
|
|---|
| 43 | # User defined aliasses. Even more convenient :)
|
|---|
| 44 | );
|
|---|
| 45 | my $txt;
|
|---|
| 46 |
|
|---|
| 47 | sub alias (@)
|
|---|
| 48 | {
|
|---|
| 49 | @_ or return %alias3;
|
|---|
| 50 | my $alias = ref $_[0] ? $_[0] : { @_ };
|
|---|
| 51 | @alias3{keys %$alias} = values %$alias;
|
|---|
| 52 | } # alias
|
|---|
| 53 |
|
|---|
| 54 | sub alias_file ($)
|
|---|
| 55 | {
|
|---|
| 56 | my ($arg, $file) = @_;
|
|---|
| 57 | if (-f $arg && File::Spec->file_name_is_absolute ($arg)) {
|
|---|
| 58 | $file = $arg;
|
|---|
| 59 | }
|
|---|
| 60 | elsif ($arg =~ m/^\w+$/) {
|
|---|
| 61 | $file = "unicore/${arg}_alias.pl";
|
|---|
| 62 | }
|
|---|
| 63 | else {
|
|---|
| 64 | croak "Charnames alias files can only have identifier characters";
|
|---|
| 65 | }
|
|---|
| 66 | if (my @alias = do $file) {
|
|---|
| 67 | @alias == 1 && !defined $alias[0] and
|
|---|
| 68 | croak "$file cannot be used as alias file for charnames";
|
|---|
| 69 | @alias % 2 and
|
|---|
| 70 | croak "$file did not return a (valid) list of alias pairs";
|
|---|
| 71 | alias (@alias);
|
|---|
| 72 | return (1);
|
|---|
| 73 | }
|
|---|
| 74 | 0;
|
|---|
| 75 | } # alias_file
|
|---|
| 76 |
|
|---|
| 77 | # This is not optimized in any way yet
|
|---|
| 78 | sub charnames
|
|---|
| 79 | {
|
|---|
| 80 | my $name = shift;
|
|---|
| 81 |
|
|---|
| 82 | if (exists $alias1{$name}) {
|
|---|
| 83 | $name = $alias1{$name};
|
|---|
| 84 | }
|
|---|
| 85 | elsif (exists $alias2{$name}) {
|
|---|
| 86 | require warnings;
|
|---|
| 87 | warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead});
|
|---|
| 88 | $name = $alias2{$name};
|
|---|
| 89 | }
|
|---|
| 90 | elsif (exists $alias3{$name}) {
|
|---|
| 91 | $name = $alias3{$name};
|
|---|
| 92 | }
|
|---|
| 93 |
|
|---|
| 94 | my $ord;
|
|---|
| 95 | my @off;
|
|---|
| 96 | my $fname;
|
|---|
| 97 |
|
|---|
| 98 | if ($name eq "BYTE ORDER MARK") {
|
|---|
| 99 | $fname = $name;
|
|---|
| 100 | $ord = 0xFEFF;
|
|---|
| 101 | } else {
|
|---|
| 102 | ## Suck in the code/name list as a big string.
|
|---|
| 103 | ## Lines look like:
|
|---|
| 104 | ## "0052\t\tLATIN CAPITAL LETTER R\n"
|
|---|
| 105 | $txt = do "unicore/Name.pl" unless $txt;
|
|---|
| 106 |
|
|---|
| 107 | ## @off will hold the index into the code/name string of the start and
|
|---|
| 108 | ## end of the name as we find it.
|
|---|
| 109 |
|
|---|
| 110 | ## If :full, look for the name exactly
|
|---|
| 111 | if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) {
|
|---|
| 112 | @off = ($-[0], $+[0]);
|
|---|
| 113 | }
|
|---|
| 114 |
|
|---|
| 115 | ## If we didn't get above, and :short allowed, look for the short name.
|
|---|
| 116 | ## The short name is like "greek:Sigma"
|
|---|
| 117 | unless (@off) {
|
|---|
| 118 | if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
|
|---|
| 119 | my ($script, $cname) = ($1, $2);
|
|---|
| 120 | my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
|
|---|
| 121 | if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) {
|
|---|
| 122 | @off = ($-[0], $+[0]);
|
|---|
| 123 | }
|
|---|
| 124 | }
|
|---|
| 125 | }
|
|---|
| 126 |
|
|---|
| 127 | ## If we still don't have it, check for the name among the loaded
|
|---|
| 128 | ## scripts.
|
|---|
| 129 | if (not @off) {
|
|---|
| 130 | my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
|
|---|
| 131 | for my $script (@{$^H{charnames_scripts}}) {
|
|---|
| 132 | if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) {
|
|---|
| 133 | @off = ($-[0], $+[0]);
|
|---|
| 134 | last;
|
|---|
| 135 | }
|
|---|
| 136 | }
|
|---|
| 137 | }
|
|---|
| 138 |
|
|---|
| 139 | ## If we don't have it by now, give up.
|
|---|
| 140 | unless (@off) {
|
|---|
| 141 | carp "Unknown charname '$name'";
|
|---|
| 142 | return "\x{FFFD}";
|
|---|
| 143 | }
|
|---|
| 144 |
|
|---|
| 145 | ##
|
|---|
| 146 | ## Now know where in the string the name starts.
|
|---|
| 147 | ## The code, in hex, is before that.
|
|---|
| 148 | ##
|
|---|
| 149 | ## The code can be 4-6 characters long, so we've got to sort of
|
|---|
| 150 | ## go look for it, just after the newline that comes before $off[0].
|
|---|
| 151 | ##
|
|---|
| 152 | ## This would be much easier if unicore/Name.pl had info in
|
|---|
| 153 | ## a name/code order, instead of code/name order.
|
|---|
| 154 | ##
|
|---|
| 155 | ## The +1 after the rindex() is to skip past the newline we're finding,
|
|---|
| 156 | ## or, if the rindex() fails, to put us to an offset of zero.
|
|---|
| 157 | ##
|
|---|
| 158 | my $hexstart = rindex($txt, "\n", $off[0]) + 1;
|
|---|
| 159 |
|
|---|
| 160 | ## we know where it starts, so turn into number -
|
|---|
| 161 | ## the ordinal for the char.
|
|---|
| 162 | $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
|
|---|
| 163 | }
|
|---|
| 164 |
|
|---|
| 165 | if ($^H & $bytes::hint_bits) { # "use bytes" in effect?
|
|---|
| 166 | use bytes;
|
|---|
| 167 | return chr $ord if $ord <= 255;
|
|---|
| 168 | my $hex = sprintf "%04x", $ord;
|
|---|
| 169 | if (not defined $fname) {
|
|---|
| 170 | $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
|
|---|
| 171 | }
|
|---|
| 172 | croak "Character 0x$hex with name '$fname' is above 0xFF";
|
|---|
| 173 | }
|
|---|
| 174 |
|
|---|
| 175 | no warnings 'utf8'; # allow even illegal characters
|
|---|
| 176 | return pack "U", $ord;
|
|---|
| 177 | } # charnames
|
|---|
| 178 |
|
|---|
| 179 | sub import
|
|---|
| 180 | {
|
|---|
| 181 | shift; ## ignore class name
|
|---|
| 182 |
|
|---|
| 183 | if (not @_) {
|
|---|
| 184 | carp("`use charnames' needs explicit imports list");
|
|---|
| 185 | }
|
|---|
| 186 | $^H |= $charnames::hint_bits;
|
|---|
| 187 | $^H{charnames} = \&charnames ;
|
|---|
| 188 |
|
|---|
| 189 | ##
|
|---|
|
|---|