Context Navigation

UCD.pm

Visit:

Last change on this file was 3181, checked in by bird, 19 years ago
perl 5.8.8
File size: 22.3 KB

Rev	Line
[3181]	1	package Unicode::UCD;
	2
	3	use strict;
	4	use warnings;
	5
	6	our $VERSION = '0.24';
	7
	8	use Storable qw(dclone);
	9
	10	require Exporter;
	11
	12	our @ISA = qw(Exporter);
	13
	14	our @EXPORT_OK = qw(charinfo
	15	charblock charscript
	16	charblocks charscripts
	17	charinrange
	18	compexcl
	19	casefold casespec
	20	namedseq);
	21
	22	use Carp;
	23
	24	=head1 NAME
	25
	26	Unicode::UCD - Unicode character database
	27
	28	=head1 SYNOPSIS
	29
	30	use Unicode::UCD 'charinfo';
	31	my $charinfo = charinfo($codepoint);
	32
	33	use Unicode::UCD 'charblock';
	34	my $charblock = charblock($codepoint);
	35
	36	use Unicode::UCD 'charscript';
	37	my $charscript = charscript($codepoint);
	38
	39	use Unicode::UCD 'charblocks';
	40	my $charblocks = charblocks();
	41
	42	use Unicode::UCD 'charscripts';
	43	my $charscripts = charscripts();
	44
	45	use Unicode::UCD qw(charscript charinrange);
	46	my $range = charscript($script);
	47	print "looks like $script\n" if charinrange($range, $codepoint);
	48
	49	use Unicode::UCD 'compexcl';
	50	my $compexcl = compexcl($codepoint);
	51
	52	use Unicode::UCD 'namedseq';
	53	my $namedseq = namedseq($named_sequence_name);
	54
	55	my $unicode_version = Unicode::UCD::UnicodeVersion();
	56
	57	=head1 DESCRIPTION
	58
	59	The Unicode::UCD module offers a simple interface to the Unicode
	60	Character Database.
	61
	62	=cut
	63
	# Filehandles for the unicore data files.  Each one stays undefined until
	# openunicode() is asked to open the corresponding file.
	my (
	    $UNICODEFH,     # UnicodeData.txt
	    $BLOCKSFH,      # Blocks.txt
	    $SCRIPTSFH,     # Scripts.txt
	    $VERSIONFH,     # version
	    $COMPEXCLFH,    # CompositionExclusions.txt
	    $CASEFOLDFH,    # CaseFolding.txt
	    $CASESPECFH,    # SpecialCasing.txt
	    $NAMEDSEQFH     # NamedSequences.txt
	);
	72
	# openunicode(\$fh, @path)
	#
	# Opens, for reading, the file "unicore/@path" found under the first
	# directory of @INC that contains it, and stores the filehandle in the
	# scalar referenced by the first argument.
	#
	# Returns the path of the file that was opened.  If the referenced scalar
	# is already defined nothing is opened and undef is returned.  Croaks when
	# the file cannot be opened under any @INC directory.
	sub openunicode {
	    my ($rfh, @path) = @_;
	    use File::Spec;

	    my $found;
	    unless (defined $$rfh) {
	        for my $dir (@INC) {
	            # @INC may hold hooks (code/array/object refs); they are not
	            # directories, so there is nothing to look for under them.
	            next if ref $dir;
	            my $candidate = File::Spec->catfile($dir, "unicore", @path);
	            # Three-argument open: the path is taken literally, so mode
	            # characters or surrounding whitespace in it are not interpreted.
	            if (open($$rfh, '<', $candidate)) {
	                $found = $candidate;
	                last;
	            }
	        }
	        unless (defined $found) {
	            # open() autovivifies the handle even when it fails; reset it so
	            # a later call tries again instead of seeing a "defined" handle.
	            undef $$rfh;
	            croak __PACKAGE__, ": failed to find ",
	                  File::Spec->catfile(@path), " in @INC";
	        }
	    }
	    return $found;
	}
	89
	90	=head2 charinfo
	91
	92	use Unicode::UCD 'charinfo';
	93
	94	my $charinfo = charinfo(0x41);
	95
	96	charinfo() returns a reference to a hash that has the following fields
	97	as defined by the Unicode standard:
	98
	99	key
	100
	101	code code point with at least four hexdigits
	102	name name of the character IN UPPER CASE
	103	category general category of the character
	104	combining classes used in the Canonical Ordering Algorithm
	105	bidi bidirectional category
	106	decomposition character decomposition mapping
	107	decimal if decimal digit this is the integer numeric value
	108	digit if digit this is the numeric value
	109	numeric if numeric is the integer or rational numeric value
	110	mirrored if mirrored in bidirectional text
	111	unicode10 Unicode 1.0 name if existed and different
	112	comment ISO 10646 comment field
	113	upper uppercase equivalent mapping
	114	lower lowercase equivalent mapping
	115	title titlecase equivalent mapping
	116
	117	block block the character belongs to (used in \p{In...})
	118	script script the character belongs to
	119
	120	If no match is found, C<undef> is returned (an empty list in list context).
	121
	122	The C<block> property is the same as returned by charblock(). It is
	123	not defined in the Unicode Character Database proper (Chapter 4 of the
	124	Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
	125	(Chapter 14 of TUS3). Similarly for the C<script> property.
	126
	127	Note that you cannot do (de)composition and casing based solely on the
	128	above C<decomposition> and C<lower>, C<upper>, C<title>, properties,
	129	you will need also the compexcl(), casefold(), and casespec() functions.
	130
	131	=cut
	132
	133	# NB: This function is duplicated in charnames.pm
	# _getcode($arg)
	#
	# Normalises a code point argument to a number.
	#   - A string of decimal digits not starting with 0 is returned unchanged.
	#   - A string of hex digits, optionally prefixed with "U+", "u+", "0x" or
	#     "0X", is returned as its numeric value.
	# Anything else, including undef, gives an empty return (undef in scalar
	# context), which callers use to detect "not a code point".
	sub _getcode {
	    my $arg = shift;

	    return unless defined $arg;

	    return $arg if $arg =~ /^[1-9]\d*$/;

	    # The two prefixes are alternatives, so the "|" must be unescaped: an
	    # escaped "\|" would demand the literal text "U+|0x" and reject every
	    # ordinary "U+XXXX" or "0xXXXX" argument.
	    return hex($1) if $arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/;

	    return;
	}
	145
	146	# Lingua::KO::Hangul::Util not part of the standard distribution
	147	# but it will be used if available.
	148
	# True when the optional Lingua::KO::Hangul::Util module could be loaded;
	# its default exports are imported only in that case.
	my $hasHangulUtil = do {
	    eval { require Lingua::KO::Hangul::Util };
	    !$@;
	};
	Lingua::KO::Hangul::Util->import() if $hasHangulUtil;
	154
	# Internal helper called from charinfo().
	#
	# When Lingua::KO::Hangul::Util is available, passes the code point to its
	# decomposeHangul() and formats a two- or three-element result as
	# space-separated hex numbers of at least four digits.  In every other
	# case the return is empty.
	sub hangul_decomp {
	    my ($code) = @_;

	    return unless $hasHangulUtil;

	    my @parts = decomposeHangul($code);
	    my $count = @parts;
	    return sprintf("%04X %04X", @parts)      if $count == 2;
	    return sprintf("%04X %04X %04X", @parts) if $count == 3;
	    return;
	}
	163
	# Internal helper called from charinfo(): builds the fallback name for a
	# Hangul syllable from its code point (hex, at least four digits).
	sub hangul_charname {
	    my ($code) = @_;
	    return sprintf("HANGUL SYLLABLE-%04X", $code);
	}
	167
	# Internal helper called from charinfo(): builds the name of a CJK unified
	# ideograph from its code point (hex, at least four digits).
	sub han_charname {
	    my ($code) = @_;
	    return sprintf("CJK UNIFIED IDEOGRAPH-%04X", $code);
	}
	171
	# Code point ranges that charinfo() treats specially.  Each row is
	#
	#   [ first, last, coderef to name, coderef to decompose ]
	#
	# where either coderef may be undef.  The trailing comment gives the
	# block each row stands for.
	my @CharinfoRanges = (
	    [ 0x3400,   0x4DB5,   \&han_charname, undef ],   # CJK Ideographs Extension A
	    [ 0x4E00,   0x9FA5,   \&han_charname, undef ],   # CJK Ideographs
	    [ 0xAC00,   0xD7A3,                              # Hangul Syllables
	      ( $hasHangulUtil ? \&getHangulName : \&hangul_charname ),
	      \&hangul_decomp ],
	    [ 0xD800,   0xDB7F,   undef,          undef ],   # Non-Private Use High Surrogates
	    [ 0xDB80,   0xDBFF,   undef,          undef ],   # Private Use High Surrogates
	    [ 0xDC00,   0xDFFF,   undef,          undef ],   # Low Surrogates
	    [ 0xE000,   0xF8FF,   undef,          undef ],   # The Private Use Area
	    [ 0x20000,  0x2A6D6,  \&han_charname, undef ],   # CJK Ideographs Extension B
	    [ 0xF0000,  0xFFFFD,  undef,          undef ],   # Plane 15 Private Use Area
	    [ 0x100000, 0x10FFFD, undef,          undef ],   # Plane 16 Private Use Area
	);
	196
	# charinfo($arg)
	#
	# Looks the code point up in UnicodeData.txt and returns a reference to a
	# hash of its fields plus "block" and "script".  Croaks when $arg is not a
	# code point argument; gives an empty return when no matching record is
	# found.
	#
	# Code points inside one of @CharinfoRanges are looked up under the first
	# code point of their range, and the "code", "name" and "decomposition"
	# fields of that record are then replaced by values computed for the
	# requested code point.
	sub charinfo {
	    my $arg  = shift;
	    my $code = _getcode($arg);
	    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
	        unless defined $code;

	    # Six-digit key so that it sorts like the transformed file lines below.
	    my $hexk = sprintf("%06X", $code);

	    my ($rcode, $rname, $rdec);
	    for my $range (@CharinfoRanges) {
	        my ($first, $last, $namer, $decomposer) = @$range;
	        next unless $first <= $code && $code <= $last;

	        $rcode = sprintf("%04X", $code);
	        $rname = $namer      ? $namer->($code)      : '';
	        $rdec  = $decomposer ? $decomposer->($code) : '';
	        $hexk  = sprintf("%06X", $first);   # replace by the first
	        last;
	    }

	    openunicode(\$UNICODEFH, "UnicodeData.txt");
	    return unless defined $UNICODEFH;

	    use Search::Dict 1.02;
	    # Compare on a zero-padded code so the string order seen by look()
	    # agrees with numeric code point order.
	    my $xfrm = sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) };
	    return unless look($UNICODEFH, "$hexk;", { xfrm => $xfrm }) >= 0;

	    my $line = <$UNICODEFH>;
	    return unless defined $line;
	    chomp $line;

	    my @fields = qw(
	        code name category
	        combining bidi decomposition
	        decimal digit numeric
	        mirrored unicode10 comment
	        upper lower title
	    );
	    my %prop;
	    @prop{@fields} = split(/;/, $line, -1);

	    # look() only positions the handle at the first line not less than the
	    # key, so confirm that the record really is the one asked for.
	    my $wanted = sprintf("%04X", hex($hexk));
	    return unless $prop{code} eq $wanted;

	    $prop{block}  = charblock($code);
	    $prop{script} = charscript($code);
	    if (defined $rname) {
	        $prop{code}          = $rcode;
	        $prop{name}          = $rname;
	        $prop{decomposition} = $rdec;
	    }
	    return \%prop;
	}
	246
	# _search($table, $lo, $hi, $code)
	#
	# Binary search in a [[lo,hi,prop],[...],...] table sorted by "lo".
	# Examines rows $lo..$hi and returns the "prop" of the row whose
	# lo..hi interval contains $code; gives an empty return when no row does.
	sub _search {
	    my ($table, $lo, $hi, $code) = @_;

	    until ($lo > $hi) {
	        my $mid = int(($lo + $hi) / 2);
	        my $row = $table->[$mid];

	        if ($row->[0] > $code) {
	            # Row starts after the code: continue in the lower half.
	            $hi = $mid - 1;
	        }
	        elsif ($row->[0] == $code || $row->[1] >= $code) {
	            return $row->[2];
	        }
	        else {
	            # Row ends before the code: continue in the upper half.
	            $lo = $mid + 1;
	        }
	    }

	    return;
	}
	266
	# charinrange($range, $arg)
	#
	# Searches $range, a reference to an array of [lo, hi, prop] rows, for the
	# code point given by $arg and returns whatever _search() finds: the
	# "prop" of the matching row, or nothing.  Croaks when $arg is not a code
	# point argument.
	sub charinrange {
	    my ($range, $arg) = @_;

	    my $code = _getcode($arg);
	    defined $code
	        or croak __PACKAGE__, "::charinrange: unknown code '$arg'";

	    return _search($range, 0, $#{$range}, $code);
	}
	274
	275	=head2 charblock
	276
	277	use Unicode::UCD 'charblock';
	278
	279	my $charblock = charblock(0x41);
	280	my $charblock = charblock(1234);
	281	my $charblock = charblock("0x263a");
	282	my $charblock = charblock("U+263a");
	283
	284	my $range = charblock('Armenian');
	285
	286	With a B<code point argument> charblock() returns the I<block> the character
	287	belongs to, e.g. C<Basic Latin>. Note that not all the character
	288	positions within all blocks are defined.
	289
	290	See also L</Blocks versus Scripts>.
	291
	292	If supplied with an argument that can't be a code point, charblock() tries
	293	to do the opposite and interpret the argument as a character block. The
	294	return value is a I<range>: an anonymous list of lists that contain
	295	I<start-of-range>, I<end-of-range> code point pairs. You can test whether
	296	a code point is in a range using the L</charinrange> function. If the
	297	argument is not a known character block, C<undef> is returned.
	298
	299	=cut
	300
	# @BLOCKS holds every [lo, hi, name] row in file order; %BLOCKS maps a
	# block name to the list of its rows (the same array refs as in @BLOCKS).
	my (@BLOCKS, %BLOCKS);

	# Fills @BLOCKS and %BLOCKS from Blocks.txt.  Does nothing when @BLOCKS
	# is already populated or when openunicode() does not return a path.
	sub _charblocks {
	    return if @BLOCKS;
	    return unless openunicode(\$BLOCKSFH, "Blocks.txt");

	    local $_;
	    while (<$BLOCKSFH>) {
	        next unless /^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/;
	        my ($first, $last, $name) = ($1, $2, $3);

	        my $subrange = [ hex($first), hex($last), $name ];
	        push @BLOCKS, $subrange;
	        push @{ $BLOCKS{$name} }, $subrange;
	    }
	    close($BLOCKSFH);
	    return;
	}
	320
	# charblock($arg)
	#
	# With a code point argument, returns the name of the block containing it
	# (nothing when no block does).  With any other argument, treats it as a
	# block name and returns a deep copy of that block's range list, or
	# nothing when the name is unknown.
	sub charblock {
	    my $arg = shift;

	    _charblocks() unless @BLOCKS;

	    my $code = _getcode($arg);
	    return _search(\@BLOCKS, 0, $#BLOCKS, $code) if defined $code;

	    return unless exists $BLOCKS{$arg};
	    # Hand out a copy so callers cannot modify the cached table.
	    return dclone $BLOCKS{$arg};
	}
	338
	339	=head2 charscript
	340
	341	use Unicode::UCD 'charscript';
	342
	343	my $charscript = charscript(0x41);
	344	my $charscript = charscript(1234);
	345	my $charscript = charscript("U+263a");
	346
	347	my $range = charscript('Thai');
	348
	349	With a B<code point argument> charscript() returns the I<script> the
	350	character belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
	351
	352	See also L</Blocks versus Scripts>.
	353
	354	If supplied with an argument that can't be a code point, charscript() tries
	355	to do the opposite and interpret the argument as a character script. The
	356	return value is a I<range>: an anonymous list of lists that contain
	357	I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
	358	code point is in a range using the L</charinrange> function. If the
	359	argument is not a known character script, C<undef> is returned.
	360
	361	=cut
	362
	# @SCRIPTS holds every [lo, hi, script] row sorted by "lo"; %SCRIPTS maps
	# a script name to the list of its rows (the same array refs).
	my (@SCRIPTS, %SCRIPTS);

	# Fills @SCRIPTS and %SCRIPTS from Scripts.txt.  Does nothing when
	# @SCRIPTS is already populated or when openunicode() does not return a
	# path.
	sub _charscripts {
	    return if @SCRIPTS;
	    return unless openunicode(\$SCRIPTSFH, "Scripts.txt");

	    local $_;
	    while (<$SCRIPTSFH>) {
	        next unless /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+)/;
	        my ($first, $last, $raw) = ($1, $2, $3);

	        # A line may name a single code point instead of a range.
	        my $lo = hex($first);
	        my $hi = $last ? hex($last) : hex($first);

	        # Normalise the name: all lower case, then capitalised.
	        my $script = lc($raw);
	        $script =~ s/\b(\w)/uc($1)/ge;

	        my $subrange = [ $lo, $hi, $script ];
	        push @SCRIPTS, $subrange;
	        push @{ $SCRIPTS{$script} }, $subrange;
	    }
	    close($SCRIPTSFH);

	    # The rows are sorted here because _search() needs them ordered by "lo".
	    @SCRIPTS = sort { $a->[0] <=> $b->[0] } @SCRIPTS;
	    return;
	}
	385
	# charscript($arg)
	#
	# With a code point argument, returns the name of the script containing
	# it (nothing when no script does).  With any other argument, treats it
	# as a script name and returns a deep copy of that script's range list,
	# or nothing when the name is unknown.
	sub charscript {
	    my $arg = shift;

	    _charscripts() unless @SCRIPTS;

	    my $code = _getcode($arg);
	    return _search(\@SCRIPTS, 0, $#SCRIPTS, $code) if defined $code;

	    return unless exists $SCRIPTS{$arg};
	    # Hand out a copy so callers cannot modify the cached table.
	    return dclone $SCRIPTS{$arg};
	}
	403
	404	=head2 charblocks
	405
	406	use Unicode::UCD 'charblocks';
	407
	408	my $charblocks = charblocks();
	409
	410	charblocks() returns a reference to a hash with the known block names
	411	as the keys, and the code point ranges (see L</charblock>) as the values.
	412
	413	See also L</Blocks versus Scripts>.
	414
	415	=cut
	416
	# Returns a reference to a deep copy of the block-name => ranges table,
	# loading the table first if it is still empty.
	sub charblocks {
	    if (!%BLOCKS) {
	        _charblocks();
	    }
	    return dclone(\%BLOCKS);
	}
	421
	422	=head2 charscripts
	423
	424	use Unicode::UCD 'charscripts';
	425
	426	my $charscripts = charscripts();
	427
	428	charscripts() returns a reference to a hash with the known script names
	429	as the keys, and the code point ranges (see L</charscript>) as the values.
	430
	431	See also L</Blocks versus Scripts>.
	432
	433	=cut
	434
	# Returns a reference to a deep copy of the script-name => ranges table,
	# loading the table first if it is still empty.
	sub charscripts {
	    if (!%SCRIPTS) {
	        _charscripts();
	    }
	    return dclone(\%SCRIPTS);
	}
	439
	440	=head2 Blocks versus Scripts
	441
	442	The difference between a block and a script is that scripts are closer
	443	to the linguistic notion of a set of characters required to present
	444	languages, while block is more of an artifact of the Unicode character
	445	numbering and separation into blocks of (mostly) 256 characters.
	446
	447	For example the Latin B<script> is spread over several B<blocks>, such
	448	as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
	449	C<Latin Extended-B>. On the other hand, the Latin script does not
	450	contain all the characters of the C<Basic Latin> block (also known as
	451	the ASCII): it includes only the letters, and not, for example, the digits
	452	or the punctuation.
	453
	454	For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
	455
	456	For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
	457
	458	=head2 Matching Scripts and Blocks
	459
	460	Scripts are matched with the regular-expression construct
	461	C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
	462	while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
	463	any of the 256 code points in the Tibetan block).
	464
	465	=head2 Code Point Arguments
	466
	467	A I<code point argument> is either a decimal or a hexadecimal scalar
	468	designating a Unicode character, or C<U+> followed by hexadecimals
	469	designating a Unicode character. In other words, if you want a code
	470	point to be interpreted as a hexadecimal number, you must prefix it
	471	with either C<0x> or C<U+>, because a string like e.g. C<123> will
	472	be interpreted as a decimal code point. Also note that Unicode is
	473	B<not> limited to 16 bits (the number of Unicode characters is
	474	open-ended, in theory unlimited): you may have more than 4 hexdigits.
	475
	476	=head2 charinrange
	477
	478	In addition to using the C<\p{In...}> and C<\P{In...}> constructs, you
	479	can also test whether a code point is in the I<range> as returned by
	480	L</charblock> and L</charscript> or as the values of the hash returned
	481	by L</charblocks> and L</charscripts> by using charinrange():
	482
	483	use Unicode::UCD qw(charscript charinrange);
	484
	485	$range = charscript('Hiragana');
	486	print "looks like hiragana\n" if charinrange($range, $codepoint);
	487
	488	=cut
	489
	490	=head2 compexcl
	491
	492	use Unicode::UCD 'compexcl';
	493
	494	my $compexcl = compexcl("09dc");
	495
	496	The compexcl() returns the composition exclusion (that is, if the
	497	character should not be produced during a precomposition) of the
	498	character specified by a B<code point argument>.
	499
	500	If there is a composition exclusion for the character, true is
	501	returned. Otherwise, false is returned.
	502
	503	=cut
	504
	# Set of excluded code points: the keys are numeric code points, the
	# values are always undef (only key existence is tested).
	my %COMPEXCL;

	# Fills %COMPEXCL from CompositionExclusions.txt.  Does nothing when the
	# set is already populated or when openunicode() does not return a path.
	sub _compexcl {
	    return if %COMPEXCL;
	    return unless openunicode(\$COMPEXCLFH, "CompositionExclusions.txt");

	    local $_;
	    while (<$COMPEXCLFH>) {
	        next unless /^([0-9A-F]+)\s+\#\s+/;
	        $COMPEXCL{ hex($1) } = undef;
	    }
	    close($COMPEXCLFH);
	    return;
	}
	521
	# compexcl($arg)
	#
	# Returns true when the code point given by $arg is listed in the
	# composition exclusions table, false otherwise.  Croaks when $arg is not
	# a code point argument.
	sub compexcl {
	    my $arg = shift;

	    my $code = _getcode($arg);
	    defined $code
	        or croak __PACKAGE__, "::compexcl: unknown code '$arg'";

	    _compexcl() if !%COMPEXCL;

	    return exists $COMPEXCL{$code};
	}
	532
	533	=head2 casefold
	534
	535	use Unicode::UCD 'casefold';
	536
	537	my $casefold = casefold("00DF");
	538
	539	The casefold() returns the locale-independent case folding of the
	540	character specified by a B<code point argument>.
	541
	542	If there is a case folding for that character, a reference to a hash
	543	with the following fields is returned:
	544
	545	key
	546
	547	code code point with at least four hexdigits
	548	status "C", "F", "S", or "I"
	549	mapping one or more codes separated by spaces
	550
	551	The meaning of the I<status> is as follows:
	552
	553	C common case folding, common mappings shared
	554	by both simple and full mappings
	555	F full case folding, mappings that cause strings
	556	to grow in length. Multiple characters are separated
	557	by spaces
	558	S simple case folding, mappings to single characters
	559	where different from F
	560	I special case for dotted uppercase I and
	561	dotless lowercase i
	562	- If this mapping is included, the result is
	563	case-insensitive, but dotless and dotted I's
	564	are not distinguished
	565	- If this mapping is excluded, the result is not
	566	fully case-insensitive, but dotless and dotted
	567	I's are distinguished
	568
	569	If there is no case folding for that character, C<undef> is returned.
	570
	571	For more information about case mappings see
	572	http://www.unicode.org/unicode/reports/tr21/
	573
	574	=cut
	575
	# Maps a numeric code point to { code, status, mapping }, each value being
	# the corresponding field text of its CaseFolding.txt line.
	my %CASEFOLD;

	# Fills %CASEFOLD from CaseFolding.txt.  Does nothing when the table is
	# already populated or when openunicode() does not return a path.  A
	# later line for the same code point replaces the earlier one.
	sub _casefold {
	    return if %CASEFOLD;
	    return unless openunicode(\$CASEFOLDFH, "CaseFolding.txt");

	    local $_;
	    while (<$CASEFOLDFH>) {
	        next unless /^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/;
	        my ($hexcode, $status, $mapping) = ($1, $2, $3);

	        $CASEFOLD{ hex($hexcode) } = {
	            code    => $hexcode,
	            status  => $status,
	            mapping => $mapping,
	        };
	    }
	    close($CASEFOLDFH);
	    return;
	}
	594
	# casefold($arg)
	#
	# Returns the case folding record (a hash reference with the keys "code",
	# "status" and "mapping") stored for the code point given by $arg, or
	# undef when there is none.  Croaks when $arg is not a code point
	# argument.
	sub casefold {
	    my $arg = shift;

	    my $code = _getcode($arg);
	    defined $code
	        or croak __PACKAGE__, "::casefold: unknown code '$arg'";

	    _casefold() if !%CASEFOLD;

	    return $CASEFOLD{$code};
	}
	605
	606	=head2 casespec
	607
	608	use Unicode::UCD 'casespec';
	609
	610	my $casespec = casespec("FB00");
	611
	612	The casespec() returns the potentially locale-dependent case mapping
	613	of the character specified by a B<code point argument>. The mapping
	614	may change the length of the string (which the basic Unicode case
	615	mappings as returned by charinfo() never do).
	616
	617	If there is a special case mapping for that character, a reference to a hash
	618	with the following fields is returned:
	619
	620	key
	621
	622	code code point with at least four hexdigits
	623	lower lowercase
	624	title titlecase
	625	upper uppercase
	626	condition condition list (may be undef)
	627
	628	The C<condition> is optional. Where present, it consists of one or
	629	more I<locales> or I<contexts>, separated by spaces (other than as
	630	used to separate elements, spaces are to be ignored). A condition
	631	list overrides the normal behavior if all of the listed conditions are
	632	true. Case distinctions in the condition list are not significant.
	633	Conditions preceded by "NON_" represent the negation of the condition.
	634
	635	Note that when there are multiple case mapping definitions for a
	636	single code point because of different locales, the value returned by
	637	casespec() is a hash reference which has the locales as the keys and
	638	hash references as described above as the values.
	639
	640	A I<locale> is defined as a 2-letter ISO 3166 country code, possibly
	641	followed by a "_" and a 2-letter ISO language code (possibly followed
	642	by a "_" and a variant code). You can find the lists of those codes,
	643	see L<Locale::Country> and L<Locale::Language>.
	644
	645	A I<context> is one of the following choices:
	646
	647	FINAL The letter is not followed by a letter of
	648	general category L (e.g. Ll, Lt, Lu, Lm, or Lo)
	649	MODERN The mapping is only used for modern text
	650	AFTER_i The last base character was "i" (U+0069)
	651
	652	For more information about case mappings see
	653	http://www.unicode.org/unicode/reports/tr21/
	654
	655	=cut
	656
	# Maps a numeric code point either to a single record
	#   { code, lower, title, upper, condition }
	# or, when SpecialCasing.txt has several lines for that code point, to a
	# hash of such records keyed by the locale at the start of each condition.
	my %CASESPEC;

	# Fills %CASESPEC from SpecialCasing.txt.  Does nothing when the table is
	# already populated or when openunicode() does not return a path.
	#
	# Each data line has the form
	#   code; lower; title; upper; [condition;]
	# where lower/title/upper are each empty or one or more space-separated
	# hex code points, and the condition is one or more space-separated words.
	sub _casespec {
	    unless (%CASESPEC) {
	        if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
	            local $_;
	            while (<$CASESPECFH>) {
	                # The "(?: ...)*" repetitions matter: without the "*" a
	                # field would need exactly two elements, so every line with
	                # a single-code-point mapping would be skipped.
	                if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
	                    my ($hexcode, $lower, $title, $upper, $condition) =
	                        ($1, $2, $3, $4, $5);
	                    my $code = hex($hexcode);
	                    if (exists $CASESPEC{$code}) {
	                        # A second line for this code point.  If the stored
	                        # value is still a single record, move it under its
	                        # locale key first.
	                        if (exists $CASESPEC{$code}->{code}) {
	                            my ($oldlower,
	                                $oldtitle,
	                                $oldupper,
	                                $oldcondition) =
	                                    @{$CASESPEC{$code}}{qw(lower
	                                                           title
	                                                           upper
	                                                           condition)};
	                            if (defined $oldcondition) {
	                                my ($oldlocale) =
	                                    ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
	                                delete $CASESPEC{$code};
	                                $CASESPEC{$code}->{$oldlocale} =
	                                    { code      => $hexcode,
	                                      lower     => $oldlower,
	                                      title     => $oldtitle,
	                                      upper     => $oldupper,
	                                      condition => $oldcondition };
	                            }
	                        }
	                        my ($locale) =
	                            ($condition =~ /^([a-z][a-z](?:_\S+)?)/);
	                        $CASESPEC{$code}->{$locale} =
	                            { code      => $hexcode,
	                              lower     => $lower,
	                              title     => $title,
	                              upper     => $upper,
	                              condition => $condition };
	                    } else {
	                        $CASESPEC{$code} =
	                            { code      => $hexcode,
	                              lower     => $lower,
	                              title     => $title,
	                              upper     => $upper,
	                              condition => $condition };
	                    }
	                }
	            }
	            close($CASESPECFH);
	        }
	    }
	}
	712
	# casespec($arg)
	#
	# Returns a deep copy of the special casing entry stored for the code
	# point given by $arg, or undef when there is none.  Croaks when $arg is
	# not a code point argument.
	sub casespec {
	    my $arg = shift;

	    my $code = _getcode($arg);
	    defined $code
	        or croak __PACKAGE__, "::casespec: unknown code '$arg'";

	    _casespec() if !%CASESPEC;

	    my $entry = $CASESPEC{$code};
	    # Only references can be cloned; a missing entry is passed through.
	    return ref $entry ? dclone $entry : $entry;
	}
	723
	724	=head2 namedseq()
	725
	726	use Unicode::UCD 'namedseq';
	727
	728	my $namedseq = namedseq("KATAKANA LETTER AINU P");
	729	my @namedseq = namedseq("KATAKANA LETTER AINU P");
	730	my %namedseq = namedseq();
	731
	732	If used with a single argument in a scalar context, returns the string
	733	consisting of the code points of the named sequence, or C<undef> if no
	734	named sequence by that name exists. If used with a single argument in
	735	a list context, returns list of the code points. If used with no
	736	arguments in a list context, returns a hash with the names of the
	737	named sequences as the keys and the named sequences as strings as
	738	the values. Otherwise, returns C<undef> or empty list depending
	739	on the context.
	740
	741	(New from Unicode 4.1.0)
	742
	743	=cut
	744
	# Maps the name of a named sequence to the string made of its characters.
	my %NAMEDSEQ;

	# Fills %NAMEDSEQ from NamedSequences.txt.  Does nothing when the table
	# is already populated or when openunicode() does not return a path.
	sub _namedseq {
	    return if %NAMEDSEQ;
	    return unless openunicode(\$NAMEDSEQFH, "NamedSequences.txt");

	    local $_;
	    while (<$NAMEDSEQFH>) {
	        next unless /^(.+)\s;\s([0-9A-F]+(?: [0-9A-F]+)*)$/;
	        my ($name, $codes) = ($1, $2);

	        # Turn each hex code point into its character and concatenate.
	        $NAMEDSEQ{$name} = join("", map { chr(hex($_)) } split(' ', $codes));
	    }
	    close($NAMEDSEQFH);
	    return;
	}
	762
	# namedseq([$name])
	#
	#   list context,   no argument : the whole name => string table
	#   list context,   one argument: the code points of that sequence, or an
	#                                 empty list when the name is unknown
	#   scalar context, one argument: the sequence as a string, or undef
	#   anything else               : empty return
	sub namedseq {
	    _namedseq() unless %NAMEDSEQ;

	    my $context = wantarray();
	    return unless defined $context;     # void context

	    my $argc = @_;

	    unless ($context) {                 # scalar context
	        return $NAMEDSEQ{ $_[0] } if $argc == 1;
	        return;
	    }

	    return %NAMEDSEQ if $argc == 0;
	    return unless $argc == 1;

	    my $seq = $NAMEDSEQ{ $_[0] };
	    return unless defined $seq;
	    return map { ord($_) } split('', $seq);
	}
	780
	781	=head2 Unicode::UCD::UnicodeVersion
	782
	783	Unicode::UCD::UnicodeVersion() returns the version of the Unicode
	784	Character Database, in other words, the version of the Unicode
	785	standard the database implements. The version is a string
	786	of numbers delimited by dots (C<'.'>).
	787
	788	=cut
	789
	# Cached version string; undefined until UnicodeVersion() is first called.
	my $UNICODEVERSION;

	# Returns the first line of the unicore "version" file with its line
	# ending removed.  The file is read once; later calls return the cached
	# string.  Croaks when that line is not a dotted sequence of numbers.
	sub UnicodeVersion {
	    return $UNICODEVERSION if defined $UNICODEVERSION;

	    openunicode(\$VERSIONFH, "version");
	    chomp($UNICODEVERSION = <$VERSIONFH>);
	    close($VERSIONFH);

	    $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/
	        or croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'";

	    return $UNICODEVERSION;
	}
	802
	803	=head2 Implementation Note
	804
	805	The first use of charinfo() opens a read-only filehandle to the Unicode
	806	Character Database (the database is included in the Perl distribution).
	807	The filehandle is then kept open for further queries. In other words,
	808	if you are wondering where one of your filehandles went, that's where.
	809
	810	=head1 BUGS
	811
	812	Does not yet support EBCDIC platforms.
	813
	814	=head1 AUTHOR
	815
	816	Jarkko Hietaniemi
	817
	818	=cut
	819
	820	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: vendor/perl/5.8.8/lib/Unicode/UCD.pm

Download in other formats: