package Unicode::UCD;

use strict;
use warnings;

our $VERSION = '0.24';

# dclone() is imported here; its call sites are outside this part of the file.
use Storable qw(dclone);

# Exporter is loaded at run time and inherited from, so that
# "use Unicode::UCD qw(...)" is served by Exporter's import().
require Exporter;

our @ISA = qw(Exporter);

# Functions that callers may import by name.  Only @EXPORT_OK is populated
# here, so each function has to be requested explicitly.
our @EXPORT_OK = qw(charinfo
                    charblock charscript
                    charblocks charscripts
                    charinrange
                    compexcl
                    casefold casespec
                    namedseq);

# Provides croak(), used to report errors from the caller's point of view.
use Carp;

=head1 NAME

Unicode::UCD - Unicode character database

=head1 SYNOPSIS

    use Unicode::UCD 'charinfo';
    my $charinfo = charinfo($codepoint);

    use Unicode::UCD 'charblock';
    my $charblock = charblock($codepoint);

    use Unicode::UCD 'charscript';
    my $charscript = charscript($codepoint);

    use Unicode::UCD 'charblocks';
    my $charblocks = charblocks();

    use Unicode::UCD 'charscripts';
    my $charscripts = charscripts();

    use Unicode::UCD qw(charscript charinrange);
    my $range = charscript($script);
    print "looks like $script\n" if charinrange($range, $codepoint);

    use Unicode::UCD 'compexcl';
    my $compexcl = compexcl($codepoint);

    use Unicode::UCD 'namedseq';
    my $namedseq = namedseq($named_sequence_name);

    my $unicode_version = Unicode::UCD::UnicodeVersion();

=head1 DESCRIPTION

The Unicode::UCD module offers a simple interface to the Unicode
Character Database.

=cut

# Lexical slots for file handles.  They all start out undefined; presumably
# each one is passed by reference to openunicode() by code further down in
# the file (not visible here) -- confirm against the callers.
my $UNICODEFH;
my $BLOCKSFH;
my $SCRIPTSFH;
my $VERSIONFH;
my $COMPEXCLFH;
my $CASEFOLDFH;
my $CASESPECFH;
my $NAMEDSEQFH;

# openunicode(\$fh, @path)
#
# Opens the file "unicore/@path" for reading from the first directory in
# @INC where that succeeds, storing the handle in the scalar that $rfh
# refers to.
#
#   $rfh   reference to a scalar that holds (or will hold) the file handle
#   @path  path components of the file, relative to a "unicore" directory
#
# Returns the full path of the file that was opened.  If $$rfh is already
# defined, nothing is opened and undef is returned.
#
# Croaks if the file cannot be opened under any @INC directory; in that
# case $$rfh is left undefined.
sub openunicode {
    my ($rfh, @path) = @_;
    my $f;
    unless (defined $$rfh) {
        for my $d (@INC) {
            use File::Spec;
            $f = File::Spec->catfile($d, "unicore", @path);
            # Explicit read mode: two-argument open() would strip
            # surrounding whitespace from the path and interpret mode or
            # pipe characters embedded in it.
            last if open($$rfh, '<', $f);
            undef $f;
        }
        unless (defined $f) {
            # A failed open() still autovivifies the handle.  Reset it so
            # that a later call searches again instead of mistaking the
            # unopened handle for a cached one.
            undef $$rfh;
            croak __PACKAGE__, ": failed to find ",
                  File::Spec->catfile(@path), " in @INC";
        }
    }
    return $f;
}

=head2 charinfo

    use Unicode::UCD 'charinfo';

    my $charinfo = charinfo(0x41);

charinfo() returns a reference to a hash that has the following fields
as defined by the Unicode standard:

    key

    code             code point with at least four hexdigits
    name             name of the character IN UPPER CASE
    category         general category of the character
    combining        classes used in the Canonical Ordering Algorithm
    bidi             bidirectional category
    decomposition    character decomposition mapping
    decimal          if decimal digit this is the integer numeric value
    digit            if digit this is the numeric value
    numeric          if numeric this is the integer or rational numeric value
    mirrored         if mirrored in bidirectional text
    unicode10        Unicode 1.0 name if existed and different
    comment          ISO 10646 comment field
    upper            uppercase equivalent mapping
    lower            lowercase equivalent mapping
    title            titlecase equivalent mapping

    block            block the character belongs to (used in \p{In...})
    script           script the character belongs to

If no match is found, a reference to an empty hash is returned.

The C<block> property is the same as returned by charblock().  It is
not defined in the Unicode Character Database proper (Chapter 4 of the
Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
(Chapter 14 of TUS3).  Similarly for the C<script> property.

Note that you cannot do (de)composition and casing based solely on the
above C<decomposition> and C<lower>, C<upper>, C<title> properties;
you will also need the compexcl(), casefold(), and casespec() functions.

=cut

| 133 | # NB: This function is duplicated in charnames.pm
|
|---|
| 134 | sub _getcode {
|
|---|
| 135 | my $arg = shift;
|
|---|
|
|---|