| 1 | #!/usr/local/bin/perl
|
|---|
| 2 | #
|
|---|
| 3 | # $Id: ucmlint,v 2.0 2004/05/16 20:55:16 dankogai Exp $
|
|---|
| 4 | #
|
|---|
| 5 |
|
|---|
| 6 | use strict;
|
|---|
| 7 | our $VERSION = do { my @r = (q$Revision: 2.0 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
|
|---|
| 8 |
|
|---|
| 9 | use Getopt::Std;
|
|---|
| 10 | our %Opt;
|
|---|
| 11 | getopts("Dehfv", \%Opt);
|
|---|
| 12 |
|
|---|
| 13 | if ($Opt{e}){
|
|---|
| 14 | eval{ require Encode; };
|
|---|
| 15 | $@ and die "can't load Encode : $@";
|
|---|
| 16 | }
|
|---|
| 17 |
|
|---|
| 18 | $Opt{h} and help();
|
|---|
| 19 | @ARGV or help();
|
|---|
| 20 |
|
|---|
| 21 | sub help{
|
|---|
| 22 | print <<"";
|
|---|
| 23 | $0 -[Dehfv] [ucm files ...]
|
|---|
| 24 | -D debug mode on
|
|---|
| 25 | -e test with Encode module also (requires perl 5.7.3 or higher)
|
|---|
| 26 | -h shows this message
|
|---|
| 27 | -f forces roundtrip check even for |[123]
|
|---|
| 28 | -v verbose mode
|
|---|
| 29 |
|
|---|
| 30 | }
|
|---|
| 31 |
|
|---|
| 32 | $| = 1;
|
|---|
| 33 | my (%Hdr, %U2E, %E2U);
|
|---|
| 34 | my $in_charmap = 0;
|
|---|
| 35 | my $nerror = 0;
|
|---|
| 36 | my $nwarning = 0;
|
|---|
| 37 |
|
|---|
| 38 | sub nit($;$){
|
|---|
| 39 | my ($msg, $level) = @_;
|
|---|
| 40 | my $lstr;
|
|---|
| 41 | if ($level == 2){
|
|---|
| 42 | $lstr = 'notice';
|
|---|
| 43 | }elsif ($level == 1){
|
|---|
| 44 | $lstr = 'warning'; $nwarning++;
|
|---|
| 45 | }else{
|
|---|
| 46 | $lstr = 'error'; $nerror++;
|
|---|
| 47 | }
|
|---|
| 48 | print "$ARGV:$lstr in line $.: $msg\n";
|
|---|
| 49 | }
|
|---|
| 50 |
|
|---|
| 51 | for $ARGV (@ARGV){
|
|---|
| 52 | open UCM, $ARGV or die "$ARGV:$!";
|
|---|
| 53 | %Hdr = %U2E = %E2U = ();
|
|---|
| 54 | $in_charmap = $nerror = $nwarning = 0;
|
|---|
| 55 | $. = 0;
|
|---|
| 56 | while(<UCM>){
|
|---|
| 57 | chomp;
|
|---|
| 58 | s/\s*#.*$//o; /^$/ and next;
|
|---|
| 59 | if ($_ eq "CHARMAP"){
|
|---|
| 60 | $in_charmap = 1;
|
|---|
| 61 | for my $must (qw/code_set_name mb_cur_min mb_cur_max/){
|
|---|
| 62 | exists $Hdr{$must} or nit "<$must> nonexistent";
|
|---|
| 63 | }
|
|---|
| 64 | $Hdr{mb_cur_min} > $Hdr{mb_cur_max}
|
|---|
| 65 | and nit sprintf("mb_cur_min(%d) > mb_cur_max(%d)",
|
|---|
| 66 | $Hdr{mb_cur_min},$Hdr{mb_cur_max});
|
|---|
| 67 | $in_charmap = 1;
|
|---|
| 68 | next;
|
|---|
| 69 | }
|
|---|
| 70 | unless ($in_charmap){
|
|---|
| 71 | my($hkey, $hvalue) = /^<(\S+)>\s+[\"\']?([^\"\']+)/o or next;
|
|---|
| 72 | $Opt{D} and warn "$hkey => $hvalue";
|
|---|
| 73 | if ($hkey eq "code_set_name"){ # name check
|
|---|
| 74 | exists $Hdr{code_set_name}
|
|---|
| 75 | and nit "Duplicate <code_set_name>: $hkey";
|
|---|
| 76 | }
|
|---|
| 77 | if ($hkey eq "code_set_alias"){ # alias check
|
|---|
| 78 | $hvalue eq $Hdr{code_set_name}
|
|---|
| 79 | and nit qq(alias "$hvalue" is already in <code_set_name>);
|
|---|
| 80 | }
|
|---|
| 81 | $Hdr{$hkey} = $hvalue;
|
|---|
| 82 | }else{
|
|---|
| 83 | my $name = $Hdr{code_set_name};
|
|---|
| 84 | my($unistr, $encstr, $fb) = /^(\S+)\s+(\S+)\s(\S+)/o or next;
|
|---|
| 85 | $Opt{v} and nit $_, 2;
|
|---|
| 86 | my $uni = uniparse($unistr);
|
|---|
| 87 | my $enc = encparse($encstr);
|
|---|
| 88 | $fb =~ /^\|([0123])$/ or nit "malformed fallback: $fb";
|
|---|
| 89 | $fb = $1;
|
|---|
| 90 | $Opt{f} and $fb = 0;
|
|---|
| 91 | unless ($fb == 1){ # check uni -> enc
|
|---|
| 92 | if (exists $U2E{$uni}){
|
|---|
| 93 | nit "dupe encode map: U$uni => $U2E{$uni} and $enc", 1;
|
|---|
| 94 | }else{
|
|---|
| 95 | $U2E{$uni} = $enc;
|
|---|
| 96 | if ($Opt{e} and $fb != 3) {
|
|---|
| 97 | my $e = hex2enc($enc);
|
|---|
| 98 | my $u = hex2uni($uni);
|
|---|
| 99 | my $eu = Encode::encode($name, $u);
|
|---|
| 100 | $e eq $eu
|
|---|
| 101 | or nit qq(encode('$name', $uni) != $enc);
|
|---|
| 102 | }
|
|---|
| 103 | }
|
|---|
| 104 | }
|
|---|
| 105 | unless ($fb == 3){ # check enc -> uni
|
|---|
| 106 | if (exists $E2U{$enc}){
|
|---|
| 107 | nit "dupe decode map: $enc => U$E2U{$enc} and U$uni", 1;
|
|---|
| 108 | }else{
|
|---|
| 109 | $E2U{$enc} = $uni;
|
|---|
| 110 | if ($Opt{e} and $fb != 1) {
|
|---|
| 111 | my $e = hex2enc($enc);
|
|---|
| 112 | my $u = hex2uni($uni);
|
|---|
| 113 | $Opt{D} and warn "$uni, $enc";
|
|---|
| 114 | my $de = Encode::decode($name, $e);
|
|---|
| 115 | $de eq $u
|
|---|
| 116 | or nit qq(decode('$name', $enc) != $uni);
|
|---|
| 117 | }
|
|---|
| 118 | }
|
|---|
| 119 | }
|
|---|
| 120 | # warn "$uni, $enc, $fb";
|
|---|
| 121 | }
|
|---|
| 122 | }
|
|---|
| 123 | $in_charmap or nit "Where is CHARMAP?";
|
|---|
| 124 | checkRT();
|
|---|
| 125 | printf ("$ARGV: %s error%s found\n",
|
|---|
| 126 | ($nerror == 0 ? 'no' : $nerror),
|
|---|
| 127 | ($nerror > 1 ? 's' : ''));
|
|---|
| 128 | }
|
|---|
| 129 |
|
|---|
| 130 | exit;
|
|---|
| 131 |
|
|---|
| 132 | sub hex2enc{
|
|---|
| 133 | pack("C*", map {hex($_)} split(",", shift));
|
|---|
| 134 | }
|
|---|
| 135 | sub hex2uni{
|
|---|
| 136 | join("", map { chr(hex($_)) } split(",", shift));
|
|---|
| 137 | }
|
|---|
| 138 |
|
|---|
| 139 | sub checkRT{
|
|---|
| 140 | for my $uni (keys %E2U){
|
|---|
| 141 | my $enc = $U2E{$uni} or next; # okay
|
|---|
| 142 | $E2U{$U2E{$uni}} eq $uni or
|
|---|
| 143 | nit "RT failure: U$uni => $enc =>U$E2U{$U2E{$uni}}";
|
|---|
| 144 | }
|
|---|
| 145 | for my $enc (keys %E2U){
|
|---|
| 146 | my $uni = $E2U{$enc} or next; # okay
|
|---|
| 147 | $U2E{$E2U{$enc}} eq $enc or
|
|---|
| 148 | nit "RT failure: $enc => U$uni => $U2E{$E2U{$enc}}";
|
|---|
| 149 | }
|
|---|
| 150 | }
|
|---|
| 151 |
|
|---|
| 152 |
|
|---|
| 153 | sub uniparse{
|
|---|
| 154 | my $str = shift;
|
|---|
| 155 | my @u;
|
|---|
| 156 | push @u, $1 while($str =~ /\G<U(.*?)>/ig);
|
|---|
| 157 | for my $u (@u){
|
|---|
| 158 | $u =~ /^([0-9A-Za-z]+)$/o
|
|---|
| 159 | or nit "malformed Unicode character: $u";
|
|---|
| 160 | }
|
|---|
| 161 | return join(',', @u);
|
|---|
| 162 | }
|
|---|
| 163 |
|
|---|
| 164 | sub encparse{
|
|---|
| 165 | my $str = shift;
|
|---|
| 166 | my @e;
|
|---|
| 167 | for my $e (split /\\x/io, $str){
|
|---|
| 168 | $e or next; # first \x
|
|---|
| 169 | $e =~ /^([0-9A-Za-z]{1,2})$/io
|
|---|
| 170 | or nit "Hex $e in $str is bogus";
|
|---|
| 171 | push @e, $1;
|
|---|
| 172 | }
|
|---|
| 173 | return join(',', @e);
|
|---|
| 174 | }
|
|---|
| 175 |
|
|---|
| 176 |
|
|---|
| 177 |
|
|---|
| 178 | __END__
|
|---|
| 179 |
|
|---|
| 180 | A UCM file looks like this.
|
|---|
| 181 |
|
|---|
| 182 | #
|
|---|
| 183 | # Comments
|
|---|
| 184 | #
|
|---|
| 185 | <code_set_name> "US-ascii" # Required
|
|---|
| 186 | <code_set_alias> "ascii" # Optional
|
|---|
| 187 | <mb_cur_min> 1 # Required; usually 1
|
|---|
| 188 | <mb_cur_max> 1 # Max. # of bytes/char
|
|---|
| 189 | <subchar> \x3F # Substitution char
|
|---|
| 190 | #
|
|---|
| 191 | CHARMAP
|
|---|
| 192 | <U0000> \x00 |0 # <control>
|
|---|
| 193 | <U0001> \x01 |0 # <control>
|
|---|
| 194 | <U0002> \x02 |0 # <control>
|
|---|
| 195 | ....
|
|---|
| 196 | <U007C> \x7C |0 # VERTICAL LINE
|
|---|
| 197 | <U007D> \x7D |0 # RIGHT CURLY BRACKET
|
|---|
| 198 | <U007E> \x7E |0 # TILDE
|
|---|
| 199 | <U007F> \x7F |0 # <control>
|
|---|
| 200 | END CHARMAP
|
|---|
| 201 |
|
|---|