| [3181] | 1 | package Unicode::UCD;
|
|---|
| 2 |
|
|---|
| 3 | use strict;
|
|---|
| 4 | use warnings;
|
|---|
| 5 |
|
|---|
| 6 | our $VERSION = '0.24';
|
|---|
| 7 |
|
|---|
| 8 | use Storable qw(dclone);
|
|---|
| 9 |
|
|---|
| 10 | require Exporter;
|
|---|
| 11 |
|
|---|
| 12 | our @ISA = qw(Exporter);
|
|---|
| 13 |
|
|---|
| 14 | our @EXPORT_OK = qw(charinfo
|
|---|
| 15 | charblock charscript
|
|---|
| 16 | charblocks charscripts
|
|---|
| 17 | charinrange
|
|---|
| 18 | compexcl
|
|---|
| 19 | casefold casespec
|
|---|
| 20 | namedseq);
|
|---|
| 21 |
|
|---|
| 22 | use Carp;
|
|---|
| 23 |
|
|---|
| 24 | =head1 NAME
|
|---|
| 25 |
|
|---|
| 26 | Unicode::UCD - Unicode character database
|
|---|
| 27 |
|
|---|
| 28 | =head1 SYNOPSIS
|
|---|
| 29 |
|
|---|
| 30 | use Unicode::UCD 'charinfo';
|
|---|
| 31 | my $charinfo = charinfo($codepoint);
|
|---|
| 32 |
|
|---|
| 33 | use Unicode::UCD 'charblock';
|
|---|
| 34 | my $charblock = charblock($codepoint);
|
|---|
| 35 |
|
|---|
| 36 | use Unicode::UCD 'charscript';
|
|---|
| 37 | my $charscript = charscript($codepoint);
|
|---|
| 38 |
|
|---|
| 39 | use Unicode::UCD 'charblocks';
|
|---|
| 40 | my $charblocks = charblocks();
|
|---|
| 41 |
|
|---|
| 42 | use Unicode::UCD 'charscripts';
|
|---|
| 43 | my %charscripts = charscripts();
|
|---|
| 44 |
|
|---|
| 45 | use Unicode::UCD qw(charscript charinrange);
|
|---|
| 46 | my $range = charscript($script);
|
|---|
| 47 | print "looks like $script\n" if charinrange($range, $codepoint);
|
|---|
| 48 |
|
|---|
| 49 | use Unicode::UCD 'compexcl';
|
|---|
| 50 | my $compexcl = compexcl($codepoint);
|
|---|
| 51 |
|
|---|
| 52 | use Unicode::UCD 'namedseq';
|
|---|
| 53 | my $namedseq = namedseq($named_sequence_name);
|
|---|
| 54 |
|
|---|
| 55 | my $unicode_version = Unicode::UCD::UnicodeVersion();
|
|---|
| 56 |
|
|---|
| 57 | =head1 DESCRIPTION
|
|---|
| 58 |
|
|---|
| 59 | The Unicode::UCD module offers a simple interface to the Unicode
|
|---|
| 60 | Character Database.
|
|---|
| 61 |
|
|---|
| 62 | =cut
|
|---|
| 63 |
|
|---|
| 64 | my $UNICODEFH;
|
|---|
| 65 | my $BLOCKSFH;
|
|---|
| 66 | my $SCRIPTSFH;
|
|---|
| 67 | my $VERSIONFH;
|
|---|
| 68 | my $COMPEXCLFH;
|
|---|
| 69 | my $CASEFOLDFH;
|
|---|
| 70 | my $CASESPECFH;
|
|---|
| 71 | my $NAMEDSEQFH;
|
|---|
| 72 |
|
|---|
| 73 | sub openunicode {
|
|---|
| 74 | my ($rfh, @path) = @_;
|
|---|
| 75 | my $f;
|
|---|
| 76 | unless (defined $$rfh) {
|
|---|
| 77 | for my $d (@INC) {
|
|---|
| 78 | use File::Spec;
|
|---|
| 79 | $f = File::Spec->catfile($d, "unicore", @path);
|
|---|
| 80 | last if open($$rfh, $f);
|
|---|
| 81 | undef $f;
|
|---|
| 82 | }
|
|---|
| 83 | croak __PACKAGE__, ": failed to find ",
|
|---|
| 84 | File::Spec->catfile(@path), " in @INC"
|
|---|
| 85 | unless defined $f;
|
|---|
| 86 | }
|
|---|
| 87 | return $f;
|
|---|
| 88 | }
|
|---|
| 89 |
|
|---|
| 90 | =head2 charinfo
|
|---|
| 91 |
|
|---|
| 92 | use Unicode::UCD 'charinfo';
|
|---|
| 93 |
|
|---|
| 94 | my $charinfo = charinfo(0x41);
|
|---|
| 95 |
|
|---|
| 96 | charinfo() returns a reference to a hash that has the following fields
|
|---|
| 97 | as defined by the Unicode standard:
|
|---|
| 98 |
|
|---|
| 99 | key
|
|---|
| 100 |
|
|---|
| 101 | code code point with at least four hexdigits
|
|---|
| 102 | name name of the character IN UPPER CASE
|
|---|
| 103 | category general category of the character
|
|---|
| 104 | combining classes used in the Canonical Ordering Algorithm
|
|---|
| 105 | bidi bidirectional category
|
|---|
| 106 | decomposition character decomposition mapping
|
|---|
| 107 | decimal if decimal digit this is the integer numeric value
|
|---|
| 108 | digit if digit this is the numeric value
|
|---|
| 109 | numeric if numeric is the integer or rational numeric value
|
|---|
| 110 | mirrored if mirrored in bidirectional text
|
|---|
| 111 | unicode10 Unicode 1.0 name if existed and different
|
|---|
| 112 | comment ISO 10646 comment field
|
|---|
| 113 | upper uppercase equivalent mapping
|
|---|
| 114 | lower lowercase equivalent mapping
|
|---|
| 115 | title titlecase equivalent mapping
|
|---|
| 116 |
|
|---|
| 117 | block block the character belongs to (used in \p{In...})
|
|---|
| 118 | script script the character belongs to
|
|---|
| 119 |
|
|---|
| 120 | If no match is found, a reference to an empty hash is returned.
|
|---|
| 121 |
|
|---|
| 122 | The C<block> property is the same as returned by charinfo(). It is
|
|---|
| 123 | not defined in the Unicode Character Database proper (Chapter 4 of the
|
|---|
| 124 | Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
|
|---|
| 125 | (Chapter 14 of TUS3). Similarly for the C<script> property.
|
|---|
| 126 |
|
|---|
| 127 | Note that you cannot do (de)composition and casing based solely on the
|
|---|
| 128 | above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
|
|---|
| 129 | you will need also the compexcl(), casefold(), and casespec() functions.
|
|---|
| 130 |
|
|---|
| 131 | =cut
|
|---|
| 132 |
|
|---|
| 133 | # NB: This function is duplicated in charnames.pm
|
|---|
| 134 | sub _getcode {
|
|---|
| 135 | my $arg = shift;
|
|---|
| 136 |
|
|---|
| 137 | if ($arg =~ /^[1-9]\d*$/) {
|
|---|
| 138 | return $arg;
|
|---|
| 139 | } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
|
|---|
| 140 | return hex($1);
|
|---|
| 141 | }
|
|---|
| 142 |
|
|---|
| 143 | return;
|
|---|
| 144 | }
|
|---|
| 145 |
|
|---|
| 146 | # Lingua::KO::Hangul::Util not part of the standard distribution
|
|---|
| 147 | # but it will be used if available.
|
|---|
| 148 |
|
|---|
| 149 | eval { require Lingua::KO::Hangul::Util };
|
|---|
| 150 | my $hasHangulUtil = ! $@;
|
|---|
| 151 | if ($hasHangulUtil) {
|
|---|
| 152 | Lingua::KO::Hangul::Util->import();
|
|---|
| 153 | }
|
|---|
| 154 |
|
|---|
| 155 | sub hangul_decomp { # internal: called from charinfo
|
|---|
| 156 | if ($hasHangulUtil) {
|
|---|
| 157 | my @tmp = decomposeHangul(shift);
|
|---|
| 158 | return sprintf("%04X %04X", @tmp) if @tmp == 2;
|
|---|
| 159 | return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
|
|---|
| 160 | }
|
|---|
| 161 | return;
|
|---|
| 162 | }
|
|---|
| 163 |
|
|---|
| 164 | sub hangul_charname { # internal: called from charinfo
|
|---|
| 165 | return sprintf("HANGUL SYLLABLE-%04X", shift);
|
|---|
| 166 | }
|
|---|
| 167 |
|
|---|
| 168 | sub han_charname { # internal: called from charinfo
|
|---|
| 169 | return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
|
|---|
| 170 | }
|
|---|
| 171 |
|
|---|
| 172 | my @CharinfoRanges = (
|
|---|
| 173 | # block name
|
|---|
| 174 | # [ first, last, coderef to name, coderef to decompose ],
|
|---|
| 175 | # CJK Ideographs Extension A
|
|---|
| 176 | [ 0x3400, 0x4DB5, \&han_charname, undef ],
|
|---|
| 177 | # CJK Ideographs
|
|---|
| 178 | [ 0x4E00, 0x9FA5, \&han_charname, undef ],
|
|---|
| 179 | # Hangul Syllables
|
|---|
| 180 | [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
|
|---|
| 181 | # Non-Private Use High Surrogates
|
|---|
| 182 | [ 0xD800, 0xDB7F, undef, undef ],
|
|---|
| 183 | # Private Use High Surrogates
|
|---|
| 184 | [ 0xDB80, 0xDBFF, undef, undef ],
|
|---|
| 185 | # Low Surrogates
|
|---|
| 186 | [ 0xDC00, 0xDFFF, undef, undef ],
|
|---|
| 187 | # The Private Use Area
|
|---|
| 188 | [ 0xE000, 0xF8FF, undef, undef ],
|
|---|
| 189 | # CJK Ideographs Extension B
|
|---|
| 190 | [ 0x20000, 0x2A6D6, \&han_charname, undef ],
|
|---|
| 191 | # Plane 15 Private Use Area
|
|---|
| 192 | [ 0xF0000, 0xFFFFD, undef, undef ],
|
|---|
| 193 | # Plane 16 Private Use Area
|
|---|
| 194 | [ 0x100000, 0x10FFFD, undef, undef ],
|
|---|
| 195 | );
|
|---|
| 196 |
|
|---|
| 197 | sub charinfo {
|
|---|
| 198 | my $arg = shift;
|
|---|
| 199 | my $code = _getcode($arg);
|
|---|
| 200 | croak __PACKAGE__, "::charinfo: unknown code '$arg'"
|
|---|
| 201 | unless defined $code;
|
|---|
| 202 | my $hexk = sprintf("%06X", $code);
|
|---|
| 203 | my($rcode,$rname,$rdec);
|
|---|
| 204 | foreach my $range (@CharinfoRanges){
|
|---|
| 205 | if ($range->[0] <= $code && $code <= $range->[1]) {
|
|---|
| 206 | $rcode = $hexk;
|
|---|
| 207 | $rcode =~ s/^0+//;
|
|---|
| 208 | $rcode = sprintf("%04X", hex($rcode));
|
|---|
| 209 | $rname = $range->[2] ? $range->[2]->($code) : '';
|
|---|
| 210 | $rdec = $range->[3] ? $range->[3]->($code) : '';
|
|---|
| 211 | $hexk = sprintf("%06X", $range->[0]); # replace by the first
|
|---|
| 212 | last;
|
|---|
| 213 | }
|
|---|
| 214 | }
|
|---|
| 215 | openunicode(\$UNICODEFH, "UnicodeData.txt");
|
|---|
| 216 | if (defined $UNICODEFH) {
|
|---|
| 217 | use Search::Dict 1.02;
|
|---|
| 218 | if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
|
|---|
| 219 | my $line = <$UNICODEFH>;
|
|---|
| 220 | return unless defined $line;
|
|---|
| 221 | chomp $line;
|
|---|
| 222 | my %prop;
|
|---|
| 223 | @prop{qw(
|
|---|
| 224 | code name category
|
|---|
| 225 | combining bidi decomposition
|
|---|
| 226 | decimal digit numeric
|
|---|
| 227 | mirrored unicode10 comment
|
|---|
| 228 | upper lower title
|
|---|
| 229 | )} = split(/;/, $line, -1);
|
|---|
| 230 | $hexk =~ s/^0+//;
|
|---|
| 231 | $hexk = sprintf("%04X", hex($hexk));
|
|---|
| 232 | if ($prop{code} eq $hexk) {
|
|---|
| 233 | $prop{block} = charblock($code);
|
|---|
| 234 | $prop{script} = charscript($code);
|
|---|
| 235 | if(defined $rname){
|
|---|
| 236 | $prop{code} = $rcode;
|
|---|
| 237 | $prop{name} = $rname;
|
|---|
| 238 | $prop{decomposition} = $rdec;
|
|---|
| 239 | }
|
|---|
| 240 | return \%prop;
|
|---|
| 241 | }
|
|---|
| 242 | }
|
|---|
| 243 | }
|
|---|
| 244 | return;
|
|---|
| 245 | }
|
|---|
| 246 |
|
|---|
| 247 | sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
|
|---|
| 248 | my ($table, $lo, $hi, $code) = @_;
|
|---|
| 249 |
|
|---|
| 250 | return if $lo > $hi;
|
|---|
| 251 |
|
|---|
| 252 | my $mid = int(($lo+$hi) / 2);
|
|---|
| 253 |
|
|---|
| 254 | if ($table->[$mid]->[0] < $code) {
|
|---|
| 255 | if ($table->[$mid]->[1] >= $code) {
|
|---|
| 256 | return $table->[$mid]->[2];
|
|---|
| 257 | } else {
|
|---|
| 258 | _search($table, $mid + 1, $hi, $code);
|
|---|
| 259 | }
|
|---|
| 260 | } elsif ($table->[$mid]->[0] > $code) {
|
|---|
| 261 | _search($table, $lo, $mid - 1, $code);
|
|---|
| 262 | } else {
|
|---|
| 263 | return $table->[$mid]->[2];
|
|---|
| 264 | }
|
|---|
| 265 | }
|
|---|
| 266 |
|
|---|
| 267 | sub charinrange {
|
|---|
| 268 | my ($range, $arg) = @_;
|
|---|
| 269 | my $code = _getcode($arg);
|
|---|
| 270 | croak __PACKAGE__, "::charinrange: unknown code '$arg'"
|
|---|
| 271 | unless defined $code;
|
|---|
| 272 | _search($range, 0, $#$range, $code);
|
|---|
| 273 | }
|
|---|
| 274 |
|
|---|
| 275 | =head2 charblock
|
|---|
| 276 |
|
|---|
| 277 | use Unicode::UCD 'charblock';
|
|---|
| 278 |
|
|---|
| 279 | my $charblock = charblock(0x41);
|
|---|
| 280 | my $charblock = charblock(1234);
|
|---|
| 281 | my $charblock = charblock("0x263a");
|
|---|
| 282 | my $charblock = charblock("U+263a");
|
|---|
| 283 |
|
|---|
| 284 | my $range = charblock('Armenian');
|
|---|
| 285 |
|
|---|
| 286 | With a B<code point argument> charblock() returns the I<block> the character
|
|---|
| 287 | belongs to, e.g. C<Basic Latin>. Note that not all the character
|
|---|
| 288 | positions within all blocks are defined.
|
|---|
| 289 |
|
|---|
| 290 | See also L</Blocks versus Scripts>.
|
|---|
| 291 |
|
|---|
| 292 | If supplied with an argument that can't be a code point, charblock() tries
|
|---|
| 293 | to do the opposite and interpret the argument as a character block. The
|
|---|
| 294 | return value is a I<range>: an anonymous list of lists that contain
|
|---|
| 295 | I<start-of-range>, I<end-of-range> code point pairs. You can test whether
|
|---|
| 296 | a code point is in a range using the L</charinrange> function. If the
|
|---|
| 297 | argument is not a known character block, C<undef> is returned.
|
|---|
| 298 |
|
|---|
| 299 | =cut
|
|---|
| 300 |
|
|---|
| 301 | my @BLOCKS;
|
|---|
| 302 | my %BLOCKS;
|
|---|
| 303 |
|
|---|
| 304 | sub _charblocks {
|
|---|
| 305 | unless (@BLOCKS) {
|
|---|
| 306 | if (openunicode(\$BLOCKSFH, "Blocks.txt")) {
|
|---|
| 307 | local $_;
|
|---|
| 308 | while (<$BLOCKSFH>) {
|
|---|
| 309 | if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
|
|---|
| 310 | my ($lo, $hi) = (hex($1), hex($2));
|
|---|
| 311 | my $subrange = [ $lo, $hi, $3 ];
|
|---|
| 312 | push @BLOCKS, $subrange;
|
|---|
| 313 | push @{$BLOCKS{$3}}, $subrange;
|
|---|
| 314 | }
|
|---|
| 315 | }
|
|---|
| 316 | close($BLOCKSFH);
|
|---|
| 317 | }
|
|---|
| 318 | }
|
|---|
| 319 | }
|
|---|
| 320 |
|
|---|
| 321 | sub charblock {
|
|---|
| 322 | my $arg = shift;
|
|---|
| 323 |
|
|---|
| 324 | _charblocks() unless @BLOCKS;
|
|---|
| 325 |
|
|---|
| 326 | my $code = _getcode($arg);
|
|---|
| 327 |
|
|---|
| 328 | if (defined $code) {
|
|---|
| 329 | _search(\@BLOCKS, 0, $#BLOCKS, $code);
|
|---|
| 330 | } else {
|
|---|
| 331 | if (exists $BLOCKS{$arg}) {
|
|---|
| 332 | return dclone $BLOCKS{$arg};
|
|---|
| 333 | } else {
|
|---|
| 334 | return;
|
|---|
| 335 | }
|
|---|
| 336 | }
|
|---|
| 337 | }
|
|---|
| 338 |
|
|---|
| 339 | =head2 charscript
|
|---|
| 340 |
|
|---|
| 341 | use Unicode::UCD 'charscript';
|
|---|
| 342 |
|
|---|
| 343 | my $charscript = charscript(0x41);
|
|---|
| 344 | my $charscript = charscript(1234);
|
|---|
| 345 | my $charscript = charscript("U+263a");
|
|---|
| 346 |
|
|---|
| 347 | my $range = charscript('Thai');
|
|---|
| 348 |
|
|---|
| 349 | With a B<code point argument> charscript() returns the I<script> the
|
|---|
| 350 | character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
|
|---|
| 351 |
|
|---|
| 352 | See also L</Blocks versus Scripts>.
|
|---|
| 353 |
|
|---|
| 354 | If supplied with an argument that can't be a code point, charscript() tries
|
|---|
| 355 | to do the opposite and interpret the argument as a character script. The
|
|---|
| 356 | return value is a I<range>: an anonymous list of lists that contain
|
|---|
| 357 | I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
|
|---|
| 358 | code point is in a range using the L</charinrange> function. If the
|
|---|
| 359 | argument is not a known character script, C<undef> is returned.
|
|---|
| 360 |
|
|---|
| 361 | =cut
|
|---|
| 362 |
|
|---|
| 363 | my @SCRIPTS;
|
|---|
| 364 | my %SCRIPTS;
|
|---|
| 365 |
|
|---|
| 366 | sub _charscripts {
|
|---|
| 367 | unless (@SCRIPTS) {
|
|---|
| 368 | if (openunicode(\$SCRIPTSFH, "Scripts.txt")) {
|
|---|
| 369 | local $_;
|
|---|
| 370 | while (<$SCRIPTSFH>) {
|
|---|
| 371 | if (/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/) {
|
|---|
| 372 | my ($lo, $hi) = (hex($1), $2 ? hex($2) : hex($1));
|
|---|
| 373 | my $script = lc($3);
|
|---|
| 374 | $script =~ s/\b(\w)/uc($1)/ge;
|
|---|
| 375 | my $subrange = [ $lo, $hi, $script ];
|
|---|
| 376 | push @SCRIPTS, $subrange;
|
|---|
| 377 | push @{$SCRIPTS{$script}}, $subrange;
|
|---|
| 378 | }
|
|---|
| 379 | }
|
|---|
| 380 | close($SCRIPTSFH);
|
|---|
| 381 | @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
|
|---|
| 382 | }
|
|---|
| 383 | }
|
|---|
| 384 | }
|
|---|
| 385 |
|
|---|
| 386 | sub charscript {
|
|---|
| 387 | my $arg = shift;
|
|---|
| 388 |
|
|---|
| 389 | _charscripts() unless @SCRIPTS;
|
|---|
| 390 |
|
|---|
| 391 | my $code = _getcode($arg);
|
|---|
| 392 |
|
|---|
| 393 | if (defined $code) {
|
|---|
| 394 | _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
|
|---|
| 395 | } else {
|
|---|
| 396 | if (exists $SCRIPTS{$arg}) {
|
|---|
| 397 | return dclone $SCRIPTS{$arg};
|
|---|
| 398 | } else {
|
|---|
| 399 | return;
|
|---|
| 400 | }
|
|---|
| 401 | }
|
|---|
| 402 | }
|
|---|
| 403 |
|
|---|
| 404 | =head2 charblocks
|
|---|
| 405 |
|
|---|
| 406 | use Unicode::UCD 'charblocks';
|
|---|
| 407 |
|
|---|
| 408 | my $charblocks = charblocks();
|
|---|
| 409 |
|
|---|
| 410 | charblocks() returns a reference to a hash with the known block names
|
|---|
| 411 | as the keys, and the code point ranges (see L</charblock>) as the values.
|
|---|
| 412 |
|
|---|
| 413 | See also L</Blocks versus Scripts>.
|
|---|
| 414 |
|
|---|
| 415 | =cut
|
|---|
| 416 |
|
|---|
| 417 | sub charblocks {
|
|---|
| 418 | _charblocks() unless %BLOCKS;
|
|---|
| 419 | return dclone \%BLOCKS;
|
|---|
| 420 | }
|
|---|
| 421 |
|
|---|
| 422 | =head2 charscripts
|
|---|
| 423 |
|
|---|
| 424 | use Unicode::UCD 'charscripts';
|
|---|
| 425 |
|
|---|
| 426 | my %charscripts = charscripts();
|
|---|
| 427 |
|
|---|
| 428 | charscripts() returns a hash with the known script names as the keys,
|
|---|
| 429 | and the code point ranges (see L</charscript>) as the values.
|
|---|
| 430 |
|
|---|
| 431 | See also L</Blocks versus Scripts>.
|
|---|
| 432 |
|
|---|
| 433 | =cut
|
|---|
| 434 |
|
|---|
| 435 | sub charscripts {
|
|---|
| 436 | _charscripts() unless %SCRIPTS;
|
|---|
| 437 | return dclone \%SCRIPTS;
|
|---|
| 438 | }
|
|---|
| 439 |
|
|---|
| 440 | =head2 Blocks versus Scripts
|
|---|
| 441 |
|
|---|
| 442 | The difference between a block and a script is that scripts are closer
|
|---|
| 443 | to the linguistic notion of a set of characters required to present
|
|---|
| 444 | languages, while block is more of an artifact of the Unicode character
|
|---|
| 445 | numbering and separation into blocks of (mostly) 256 characters.
|
|---|
| 446 |
|
|---|
| 447 | For example the Latin B<script> is spread over several B<blocks>, such
|
|---|
| 448 | as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
|
|---|
| 449 | C<Latin Extended-B>. On the other hand, the Latin script does not
|
|---|
| 450 | contain all the characters of the C<Basic Latin> block (also known as
|
|---|
| 451 | the ASCII): it includes only the letters, and not, for example, the digits
|
|---|
| 452 | or the punctuation.
|
|---|
| 453 |
|
|---|
| 454 | For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
|
|---|
| 455 |
|
|---|
| 456 | For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
|
|---|
| 457 |
|
|---|
| 458 | =head2 Matching Scripts and Blocks
|
|---|
| 459 |
|
|---|
| 460 | Scripts are matched with the regular-expression construct
|
|---|
| 461 | C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
|
|---|
| 462 | while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
|
|---|
| 463 | any of the 256 code points in the Tibetan block).
|
|---|
| 464 |
|
|---|
| 465 | =head2 Code Point Arguments
|
|---|
| 466 |
|
|---|
| 467 | A I<code point argument> is either a decimal or a hexadecimal scalar
|
|---|
| 468 | designating a Unicode character, or C<U+> followed by hexadecimals
|
|---|
| 469 | designating a Unicode character. In other words, if you want a code
|
|---|
| 470 | point to be interpreted as a hexadecimal number, you must prefix it
|
|---|
| 471 | with either C<0x> or C<U+>, because a string like e.g. C<123> will
|
|---|
| 472 | be interpreted as a decimal code point. Also note that Unicode is
|
|---|
| 473 | B<not> limited to 16 bits (the number of Unicode characters is
|
|---|
| 474 | open-ended, in theory unlimited): you may have more than 4 hexdigits.
|
|---|
| 475 |
|
|---|
| 476 | =head2 charinrange
|
|---|
| 477 |
|
|---|
| 478 | In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
|
|---|
| 479 | can also test whether a code point is in the I<range> as returned by
|
|---|
| 480 | L</charblock> and L</charscript> or as the values of the hash returned
|
|---|
| 481 | by L</charblocks> and L</charscripts> by using charinrange():
|
|---|
| 482 |
|
|---|
| 483 | use Unicode::UCD qw(charscript charinrange);
|
|---|
| 484 |
|
|---|
| 485 | $range = charscript('Hiragana');
|
|---|
| 486 | print "looks like hiragana\n" if charinrange($range, $codepoint);
|
|---|
| 487 |
|
|---|
| 488 | =cut
|
|---|
| 489 |
|
|---|
| 490 | =head2 compexcl
|
|---|
| 491 |
|
|---|
| 492 | use Unicode::UCD 'compexcl';
|
|---|
| 493 |
|
|---|
| 494 | my $compexcl = compexcl("09dc");
|
|---|
| 495 |
|
|---|
| 496 | The compexcl() returns the composition exclusion (that is, if the
|
|---|
| 497 | character should not be produced during a precomposition) of the
|
|---|
| 498 | character specified by a B<code point argument>.
|
|---|
| 499 |
|
|---|
| 500 | If there is a composition exclusion for the character, true is
|
|---|
| 501 | returned. Otherwise, false is returned.
|
|---|
| 502 |
|
|---|
| 503 | =cut
|
|---|
| 504 |
|
|---|
| 505 | my %COMPEXCL;
|
|---|
| 506 |
|
|---|
| 507 | sub _compexcl {
|
|---|
| 508 | unless (%COMPEXCL) {
|
|---|
| 509 | if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
|
|---|
| 510 | local $_;
|
|---|
| 511 | while (<$COMPEXCLFH>) {
|
|---|
| 512 | if (/^([0-9A-F]+)\s+\#\s+/) {
|
|---|
| 513 | my $code = hex($1);
|
|---|
| 514 | $COMPEXCL{$code} = undef;
|
|---|
| 515 | }
|
|---|
| 516 | }
|
|---|
| 517 | close($COMPEXCLFH);
|
|---|
| 518 | }
|
|---|
| 519 | }
|
|---|
| 520 | }
|
|---|
| 521 |
|
|---|
| 522 | sub compexcl {
|
|---|
| 523 | my $arg = shift;
|
|---|
| 524 | my $code = _getcode($arg);
|
|---|
| 525 | croak __PACKAGE__, "::compexcl: unknown code '$arg'"
|
|---|
| 526 | unless defined $code;
|
|---|
| 527 |
|
|---|
| 528 | _compexcl() unless %COMPEXCL;
|
|---|
| 529 |
|
|---|
| 530 | return exists $COMPEXCL{$code};
|
|---|
| 531 | }
|
|---|
| 532 |
|
|---|
| 533 | =head2 casefold
|
|---|
| 534 |
|
|---|
| 535 | use Unicode::UCD 'casefold';
|
|---|
| 536 |
|
|---|
| 537 | my $casefold = casefold("00DF");
|
|---|
| 538 |
|
|---|
| 539 | The casefold() returns the locale-independent case folding of the
|
|---|
| 540 | character specified by a B<code point argument>.
|
|---|
| 541 |
|
|---|
| 542 | If there is a case folding for that character, a reference to a hash
|
|---|
| 543 | with the following fields is returned:
|
|---|
| 544 |
|
|---|
| 545 | key
|
|---|
| 546 |
|
|---|
| 547 | code code point with at least four hexdigits
|
|---|
| 548 | status "C", "F", "S", or "I"
|
|---|
| 549 | mapping one or more codes separated by spaces
|
|---|
| 550 |
|
|---|
| 551 | The meaning of the I<status> is as follows:
|
|---|
| 552 |
|
|---|
| 553 | C common case folding, common mappings shared
|
|---|
| 554 | by both simple and full mappings
|
|---|
| 555 | F full case folding, mappings that cause strings
|
|---|
| 556 | to grow in length. Multiple characters are separated
|
|---|
| 557 | by spaces
|
|---|
| 558 | S simple case folding, mappings to single characters
|
|---|
| 559 | where different from F
|
|---|
| 560 | I special case for dotted uppercase I and
|
|---|
| 561 | dotless lowercase i
|
|---|
| 562 | - If this mapping is included, the result is
|
|---|
| 563 | case-insensitive, but dotless and dotted I's
|
|---|
| 564 | are not distinguished
|
|---|
| 565 | - If this mapping is excluded, the result is not
|
|---|
| 566 | fully case-insensitive, but dotless and dotted
|
|---|
| 567 | I's are distinguished
|
|---|
| 568 |
|
|---|
| 569 | If there is no case folding for that character, C<undef> is returned.
|
|---|
| 570 |
|
|---|
| 571 | For more information about case mappings see
|
|---|
| 572 | http://www.unicode.org/unicode/reports/tr21/
|
|---|
| 573 |
|
|---|
| 574 | =cut
|
|---|
| 575 |
|
|---|
| 576 | my %CASEFOLD;
|
|---|
| 577 |
|
|---|
| 578 | sub _casefold {
|
|---|
| 579 | unless (%CASEFOLD) {
|
|---|
| 580 | if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
|
|---|
| 581 | local $_;
|
|---|
| 582 | while (<$CASEFOLDFH>) {
|
|---|
| 583 | if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
|
|---|
| 584 | my $code = hex($1);
|
|---|
| 585 | $CASEFOLD{$code} = { code => $1,
|
|---|
| 586 | status => $2,
|
|---|
| 587 | mapping => $3 };
|
|---|
| 588 | }
|
|---|
| 589 | }
|
|---|
| 590 | close($CASEFOLDFH);
|
|---|
| 591 | }
|
|---|
| 592 | }
|
|---|
| 593 | }
|
|---|
| 594 |
|
|---|
| 595 | sub casefold {
|
|---|
| 596 | my $arg = shift;
|
|---|
| 597 | my $code = _getcode($arg);
|
|---|
| 598 | croak __PACKAGE__, "::casefold: unknown code '$arg'"
|
|---|
| 599 | unless defined $code;
|
|---|
| 600 |
|
|---|
| 601 | _casefold() unless %CASEFOLD;
|
|---|
| 602 |
|
|---|
| 603 | return $CASEFOLD{$code};
|
|---|
| 604 | }
|
|---|
| 605 |
|
|---|
| 606 | =head2 casespec
|
|---|
| 607 |
|
|---|
| 608 | use Unicode::UCD 'casespec';
|
|---|
| 609 |
|
|---|
| 610 | my $casespec = casespec("FB00");
|
|---|
| 611 |
|
|---|
| 612 | The casespec() returns the potentially locale-dependent case mapping
|
|---|
| 613 | of the character specified by a B<code point argument>. The mapping
|
|---|
| 614 | may change the length of the string (which the basic Unicode case
|
|---|
| 615 | mappings as returned by charinfo() never do).
|
|---|
| 616 |
|
|---|
| 617 | If there is a case folding for that character, a reference to a hash
|
|---|
| 618 | with the following fields is returned:
|
|---|
| 619 |
|
|---|
| 620 | key
|
|---|
| 621 |
|
|---|
| 622 | code code point with at least four hexdigits
|
|---|
| 623 | lower lowercase
|
|---|
| 624 | title titlecase
|
|---|
| 625 | upper uppercase
|
|---|
| 626 | condition condition list (may be undef)
|
|---|
| 627 |
|
|---|
| 628 | The C<condition> is optional. Where present, it consists of one or
|
|---|
| 629 | more I<locales> or I<contexts>, separated by spaces (other than as
|
|---|
| 630 | used to separate elements, spaces are to be ignored). A condition
|
|---|
| 631 | list overrides the normal behavior if all of the listed conditions are
|
|---|
| 632 | true. Case distinctions in the condition list are not significant.
|
|---|
| 633 | Conditions preceded by "NON_" represent the negation of the condition.
|
|---|
| 634 |
|
|---|
| 635 | Note that when there are multiple case folding definitions for a
|
|---|
| 636 | single code point because of different locales, the value returned by
|
|---|
| 637 | casespec() is a hash reference which has the locales as the keys and
|
|---|
| 638 | hash references as described above as the values.
|
|---|
| 639 |
|
|---|
| 640 | A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
|
|---|
| 641 | followed by a "_" and a 2-letter ISO language code (possibly followed
|
|---|
| 642 | by a "_" and a variant code). You can find the lists of those codes,
|
|---|
| 643 | see L<Locale::Country> and L<Locale::Language>.
|
|---|
| 644 |
|
|---|
| 645 | A I<context> is one of the following choices:
|
|---|
| 646 |
|
|---|
| 647 | FINAL The letter is not followed by a letter of
|
|---|
| 648 | general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
|
|---|
| 649 | MODERN The mapping is only used for modern text
|
|---|
| 650 | AFTER_i The last base character was "i" (U+0069)
|
|---|
| 651 |
|
|---|
| 652 | For more information about case mappings see
|
|---|
| 653 | http://www.unicode.org/unicode/reports/tr21/
|
|---|
| 654 |
|
|---|
| 655 | =cut
|
|---|
| 656 |
|
|---|
| 657 | my %CASESPEC;
|
|---|
| 658 |
|
|---|
| 659 | sub _casespec {
|
|---|
| 660 | unless (%CASESPEC) {
|
|---|
| 661 | if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
|
|---|
| 662 | local $_;
|
|---|
| 663 | while (<$CASESPECFH>) {
|
|---|
| 664 | if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
|
|---|
| 665 | my ($hexcode, $lower, $title, $upper, $condition) =
|
|---|
| 666 | ($1, $2, $3, $4, $5);
|
|---|
| 667 | my $code = hex($hexcode);
|
|---|
| 668 | if (exists $CASESPEC{$code}) {
|
|---|
| 669 | if (exists $CASESPEC{$code}->{code}) {
|
|---|
| 670 | my ($oldlower,
|
|---|
| 671 | $oldtitle,
|
|---|
| 672 | $oldupper,
|
|---|
| 673 | $oldcondition) =
|
|---|
| 674 | @{$CASESPEC{$code}}{qw(lower
|
|---|
| 675 | title
|
|---|
| 676 | upper
|
|---|
| 677 | condition)};
|
|---|
| 678 | if (defined $oldcondition) {
|
|---|
| 679 | my ($oldlocale) =
|
|---|
| 680 | ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
|
|---|
| 681 | delete $CASESPEC{$code};
|
|---|
| 682 | $CASESPEC{$code}->{$oldlocale} =
|
|---|
| 683 | { code => $hexcode,
|
|---|
| 684 | lower => $oldlower,
|
|---|
| 685 | title => $oldtitle,
|
|---|
| 686 | upper => $oldupper,
|
|---|
| 687 | condition => $oldcondition };
|
|---|
| 688 | }
|
|---|
| 689 | }
|
|---|
| 690 | my ($locale) =
|
|---|
| 691 | ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
|
|---|
| 692 | $CASESPEC{$code}->{$locale} =
|
|---|
| 693 | { code => $hexcode,
|
|---|
| 694 | lower => $lower,
|
|---|
| 695 | title => $title,
|
|---|
| 696 | upper => $upper,
|
|---|
| 697 | condition => $condition };
|
|---|
| 698 | } else {
|
|---|
| 699 | $CASESPEC{$code} =
|
|---|
| 700 | { code => $hexcode,
|
|---|
| 701 | lower => $lower,
|
|---|
| 702 | title => $title,
|
|---|
| 703 | upper => $upper,
|
|---|
| 704 | condition => $condition };
|
|---|
| 705 | }
|
|---|
| 706 | }
|
|---|
| 707 | }
|
|---|
| 708 | close($CASESPECFH);
|
|---|
| 709 | }
|
|---|
| 710 | }
|
|---|
| 711 | }
|
|---|
| 712 |
|
|---|
| 713 | sub casespec {
|
|---|
| 714 | my $arg = shift;
|
|---|
| 715 | my $code = _getcode($arg);
|
|---|
| 716 | croak __PACKAGE__, "::casespec: unknown code '$arg'"
|
|---|
| 717 | unless defined $code;
|
|---|
| 718 |
|
|---|
| 719 | _casespec() unless %CASESPEC;
|
|---|
| 720 |
|
|---|
| 721 | return ref $CASESPEC{$code} ? dclone $CASESPEC{$code} : $CASESPEC{$code};
|
|---|
| 722 | }
|
|---|
| 723 |
|
|---|
| 724 | =head2 namedseq()
|
|---|
| 725 |
|
|---|
| 726 | use Unicode::UCD 'namedseq';
|
|---|
| 727 |
|
|---|
| 728 | my $namedseq = namedseq("KATAKANA LETTER AINU P");
|
|---|
| 729 | my @namedseq = namedseq("KATAKANA LETTER AINU P");
|
|---|
| 730 | my %namedseq = namedseq();
|
|---|
| 731 |
|
|---|
| 732 | If used with a single argument in a scalar context, returns the string
|
|---|
| 733 | consisting of the code points of the named sequence, or C<undef> if no
|
|---|
| 734 | named sequence by that name exists. If used with a single argument in
|
|---|
| 735 | a list context, returns list of the code points. If used with no
|
|---|
| 736 | arguments in a list context, returns a hash with the names of the
|
|---|
| 737 | named sequences as the keys and the named sequences as strings as
|
|---|
| 738 | the values. Otherwise, returns C<undef> or empty list depending
|
|---|
| 739 | on the context.
|
|---|
| 740 |
|
|---|
| 741 | (New from Unicode 4.1.0)
|
|---|
| 742 |
|
|---|
| 743 | =cut
|
|---|
| 744 |
|
|---|
| 745 | my %NAMEDSEQ;
|
|---|
| 746 |
|
|---|
| 747 | sub _namedseq {
|
|---|
| 748 | unless (%NAMEDSEQ) {
|
|---|
| 749 | if (openunicode(\$NAMEDSEQFH, "NamedSequences.txt")) {
|
|---|
| 750 | local $_;
|
|---|
| 751 | while (<$NAMEDSEQFH>) {
|
|---|
| 752 | if (/^(.+)\s*;\s*([0-9A-F]+(?: [0-9A-F]+)*)$/) {
|
|---|
| 753 | my ($n, $s) = ($1, $2);
|
|---|
| 754 | my @s = map { chr(hex($_)) } split(' ', $s);
|
|---|
| 755 | $NAMEDSEQ{$n} = join("", @s);
|
|---|
| 756 | }
|
|---|
| 757 | }
|
|---|
| 758 | close($NAMEDSEQFH);
|
|---|
| 759 | }
|
|---|
| 760 | }
|
|---|
| 761 | }
|
|---|
| 762 |
|
|---|
| 763 | sub namedseq {
|
|---|
| 764 | _namedseq() unless %NAMEDSEQ;
|
|---|
| 765 | my $wantarray = wantarray();
|
|---|
| 766 | if (defined $wantarray) {
|
|---|
| 767 | if ($wantarray) {
|
|---|
| 768 | if (@_ == 0) {
|
|---|
| 769 | return %NAMEDSEQ;
|
|---|
| 770 | } elsif (@_ == 1) {
|
|---|
| 771 | my $s = $NAMEDSEQ{ $_[0] };
|
|---|
| 772 | return defined $s ? map { ord($_) } split('', $s) : ();
|
|---|
| 773 | }
|
|---|
| 774 | } elsif (@_ == 1) {
|
|---|
| 775 | return $NAMEDSEQ{ $_[0] };
|
|---|
| 776 | }
|
|---|
| 777 | }
|
|---|
| 778 | return;
|
|---|
| 779 | }
|
|---|
| 780 |
|
|---|
| 781 | =head2 Unicode::UCD::UnicodeVersion
|
|---|
| 782 |
|
|---|
| 783 | Unicode::UCD::UnicodeVersion() returns the version of the Unicode
|
|---|
| 784 | Character Database, in other words, the version of the Unicode
|
|---|
| 785 | standard the database implements. The version is a string
|
|---|
| 786 | of numbers delimited by dots (C<'.'>).
|
|---|
| 787 |
|
|---|
| 788 | =cut
|
|---|
| 789 |
|
|---|
| 790 | my $UNICODEVERSION;
|
|---|
| 791 |
|
|---|
| 792 | sub UnicodeVersion {
|
|---|
| 793 | unless (defined $UNICODEVERSION) {
|
|---|
| 794 | openunicode(\$VERSIONFH, "version");
|
|---|
| 795 | chomp($UNICODEVERSION = <$VERSIONFH>);
|
|---|
| 796 | close($VERSIONFH);
|
|---|
| 797 | croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
|
|---|
| 798 | unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
|
|---|
| 799 | }
|
|---|
| 800 | return $UNICODEVERSION;
|
|---|
| 801 | }
|
|---|
| 802 |
|
|---|
| 803 | =head2 Implementation Note
|
|---|
| 804 |
|
|---|
| 805 | The first use of charinfo() opens a read-only filehandle to the Unicode
|
|---|
| 806 | Character Database (the database is included in the Perl distribution).
|
|---|
| 807 | The filehandle is then kept open for further queries. In other words,
|
|---|
| 808 | if you are wondering where one of your filehandles went, that's where.
|
|---|
| 809 |
|
|---|
| 810 | =head1 BUGS
|
|---|
| 811 |
|
|---|
| 812 | Does not yet support EBCDIC platforms.
|
|---|
| 813 |
|
|---|
| 814 | =head1 AUTHOR
|
|---|
| 815 |
|
|---|
| 816 | Jarkko Hietaniemi
|
|---|
| 817 |
|
|---|
| 818 | =cut
|
|---|
| 819 |
|
|---|
| 820 | 1;
|
|---|