source: trunk/essentials/dev-lang/perl/lib/Unicode/UCD.pm

Last change on this file was 3181, checked in by bird, 19 years ago

perl 5.8.8

File size: 22.3 KB
Line 
1package Unicode::UCD;
2
3use strict;
4use warnings;
5
6our $VERSION = '0.24';
7
8use Storable qw(dclone);
9
10require Exporter;
11
12our @ISA = qw(Exporter);
13
14our @EXPORT_OK = qw(charinfo
15 charblock charscript
16 charblocks charscripts
17 charinrange
18 compexcl
19 casefold casespec
20 namedseq);
21
22use Carp;
23
24=head1 NAME
25
26Unicode::UCD - Unicode character database
27
28=head1 SYNOPSIS
29
30 use Unicode::UCD 'charinfo';
31 my $charinfo = charinfo($codepoint);
32
33 use Unicode::UCD 'charblock';
34 my $charblock = charblock($codepoint);
35
36 use Unicode::UCD 'charscript';
37 my $charscript = charscript($codepoint);
38
39 use Unicode::UCD 'charblocks';
40 my $charblocks = charblocks();
41
42 use Unicode::UCD 'charscripts';
43 my $charscripts = charscripts();
44
45 use Unicode::UCD qw(charscript charinrange);
46 my $range = charscript($script);
47 print "looks like $script\n" if charinrange($range, $codepoint);
48
49 use Unicode::UCD 'compexcl';
50 my $compexcl = compexcl($codepoint);
51
52 use Unicode::UCD 'namedseq';
53 my $namedseq = namedseq($named_sequence_name);
54
55 my $unicode_version = Unicode::UCD::UnicodeVersion();
56
57=head1 DESCRIPTION
58
59The Unicode::UCD module offers a simple interface to the Unicode
60Character Database.
61
62=cut
63
# File-scoped filehandle slots, one per Unicode data file, all initially
# undefined.  openunicode() opens the handle behind a reference only when
# that handle is not yet defined, so an opened handle is reused.
# NOTE(review): presumably each slot is passed by reference to openunicode()
# by the matching accessor function -- confirm against the callers.
64my $UNICODEFH;
65my $BLOCKSFH;
66my $SCRIPTSFH;
67my $VERSIONFH;
68my $COMPEXCLFH;
69my $CASEFOLDFH;
70my $CASESPECFH;
71my $NAMEDSEQFH;
72
# openunicode(\$fh, @path)
#
# Make sure the filehandle referenced by $rfh is open for reading on the
# data file "unicore/@path", searching the directories of @INC in order.
#
# Parameters:
#   $rfh  - reference to a scalar holding the filehandle; if the scalar is
#           already defined the handle is assumed open and nothing is done.
#   @path - path components of the data file below the "unicore" directory.
#
# Returns the full path of the file that was just opened, or undef when
# the handle was already defined on entry (no search is performed then).
#
# Croaks if the file cannot be opened under any directory in @INC.
sub openunicode {
    my ($rfh, @path) = @_;
    my $f;
    unless (defined $$rfh) {
        use File::Spec;
        for my $d (@INC) {
            # @INC may contain hooks (code, array or object references);
            # those are not directories and cannot hold data files.
            next if ref $d;
            $f = File::Spec->catfile($d, "unicore", @path);
            # Three-argument open: the path is taken literally, so mode
            # characters or surrounding whitespace in it are never
            # interpreted by open().
            last if open($$rfh, '<', $f);
            undef $f;
        }
        croak __PACKAGE__, ": failed to find ",
              File::Spec->catfile(@path), " in @INC"
            unless defined $f;
    }
    return $f;
}
89
90=head2 charinfo
91
92 use Unicode::UCD 'charinfo';
93
94 my $charinfo = charinfo(0x41);
95
96charinfo() returns a reference to a hash that has the following fields
97as defined by the Unicode standard:
98
99 key
100
101 code code point with at least four hexdigits
102 name name of the character IN UPPER CASE
103 category general category of the character
104 combining classes used in the Canonical Ordering Algorithm
105 bidi bidirectional category
106 decomposition character decomposition mapping
107 decimal if decimal digit this is the integer numeric value
108 digit if digit this is the numeric value
109 numeric if numeric is the integer or rational numeric value
110 mirrored if mirrored in bidirectional text
111 unicode10 Unicode 1.0 name if existed and different
112 comment ISO 10646 comment field
113 upper uppercase equivalent mapping
114 lower lowercase equivalent mapping
115 title titlecase equivalent mapping
116
117 block block the character belongs to (used in \p{In...})
118 script script the character belongs to
119
120If no match is found, a reference to an empty hash is returned.
121
122The C<block> property is the same as returned by charblock(). It is
123not defined in the Unicode Character Database proper (Chapter 4 of the
124Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
125(Chapter 14 of TUS3). Similarly for the C<script> property.
126
127Note that you cannot do (de)composition and casing based solely on the
128above C<decomposition>, C<lower>, C<upper>, and C<title> properties;
129you will also need the compexcl(), casefold(), and casespec() functions.
130
131=cut
132
133# NB: This function is duplicated in charnames.pm
134sub _getcode {
135 my $arg = shift;