package Unicode::Regex::Set;

require 5.008;

use strict;
use warnings;
use Carp;

require Exporter;
our @ISA = qw(Exporter);

our @EXPORT_OK = qw(parse maketree tostring);
our @EXPORT    = ();

our $VERSION = '0.04';
our $PACKAGE = __PACKAGE__;

use constant TRUE  => 1;
use constant FALSE => '';

my %Meaning = (
    '[' => 'group beginning',
    ']' => 'group end',
    '&' => 'intersection',
    '|' => 'union',
    ''  => 'union',
    '-' => 'subtraction',
);

#  Token combination table:  e.g  '[' followed by '&' is NG.
#
#	1\2   '['  ']'  '&'  '|'  '-'  Lit
#	'['   OK   NG   NG   NG   NG   OK
#	']'   OK   OK   OK   OK   OK   OK
#	'&'   OK   NG   NG   NG   NG   OK
#	'|'   OK   NG   NG   NG   NG   OK
#	'-'   OK   NG   NG   NG   NG   OK
#	Lit   OK   OK   OK   OK   OK   OK
#
#  Lit, literal, includes A-Z, \[, \|, \-, '\ ' (escaped space), \n, \r,
#       \t, \f, \cA, \ooo, \xhh, \x{hhhh}, \p{Prop}, \N{NAME}, [:posix:].
#       They are retained as they are.
# [=oops=] are not considered.

sub parse { tostring(maketree(@_)) }

#   $node = {
#	parent  => $node_or_undef, # undef for root
#	neg     => $boolean, # true if group begins with '[^'
#	follow  => $boolean, # true if requires literal
#	op      => $char,    # '&', '-', '|'
#	childs  => $arrayref_of_nodes,
#    }

sub maketree {
    my $cur;
    my $arg = shift;

    foreach (ref $arg ? $$arg : $arg) # store in $_
    {
	if (!s/^\[//) {
	    croak "a character class not beginning at [";
	}
	$cur = { parent => undef, op => FALSE, childs => [] };
	s/^\^// and $cur->{neg} = TRUE;

	while (1) {

	    # skip whitespaces
	    if (s/^\s+//) {
		next;
	    }

	    # beginning of a group
	    if (s/^\[  (?! \: [^\[\]]+ \:\] )//x) {
		if ($cur->{op} eq '&' && !$cur->{follow}) {
		    $cur = $cur->{parent};
		}

		push @{ $cur->{childs} },
			+{ parent => $cur, op => FALSE, childs => [] };

		$cur = $cur->{childs}->[-1];
		s/^\^// and $cur->{neg} = TRUE;
		next;
	    }

	    # end of a group
	    if (s/^\]//) {
		if (! $cur->{childs} || ! @{ $cur->{childs} }) {
		    croak "empty (sub)group in a character class";
		}

		if ($cur->{op} eq '&' && !$cur->{follow}) {
		    $cur = $cur->{parent};
		}

	    # LAST:
		last if ! $cur->{parent};

		if ($cur->{follow}) {
		    my $op = $cur->{op};
		    croak "no operand after '$op' ($Meaning{$op})";
		}

		$cur = $cur->{parent};

		$cur->{follow} and $cur->{follow} = FALSE;
		next;
	    }

	    # operators
	    if (s/^([\&\|\-])(?=[\s\[\]])//) {
		my $o = $1;

		if (! $cur->{childs} || ! @{ $cur->{childs} }) {
		    croak "no operand before '$o' ($Meaning{$o})";
		}

		if ($cur->{follow}) {
		    my $p = $cur->{op};
		    croak "no operand between '$p' ($Meaning{$p}) "
			. "and '$o' ($Meaning{$o})";
		}

		if ($cur->{op} eq $o)
		{
		    $cur->{follow} = TRUE;
		    next;
		}

		if ($cur->{op} eq '&' && !$cur->{follow})
		    # in this case $op must not be '&' (see the prev block)
		    # '&' has high precedence: [A & B - C] as [[A & B] - C]
		{
		    $cur = $cur->{parent};
		}

		if ($o eq '&')
		    # '&' has high precedence: [A B & C D] as [A [B & C] D]
		{
		    my $last = pop @{ $cur->{childs} };

		    push @{ $cur->{childs} },
			{ parent => $cur, op => $o, childs => [ $last ] };

		    $cur = $cur->{childs}->[-1];
		    $cur->{follow} = TRUE;
		    next;
		}

		if ($o eq '-') {
		    if (@{ $cur->{childs} } > 1)
			# '-' has low precedence: [A B - C] as [[A B] - C]
		    {
			my @kids = @{ $cur->{childs} };
			@{ $cur->{childs} } =
			    { parent => $cur, op => FALSE, childs => \@kids };
		    }

		    $cur->{op} = $o;
		    next;
		}

		if ($o eq '|') { # simple union
		    $cur->{op} = $o;
		    next;
		}
	    }


	    if (s/^((?:
		    \\[pPN]\{ [^{}]* \}
		  | \\c?(?s:.)
		  | [^\s\[\]]
		  | \[\: [^\[\]]+ \:\]
			)+)//x)
	    {
		my $lit = $1;

		if ($lit eq '^') {
		    croak "A bare '^', that has nothing to be negated.";
		}

		if ($cur->{op} eq '&' && !$cur->{follow})
		    # '&' has high precedence: [A & B C] as [[A & B] C]
		{
		    $cur = $cur->{parent};
		}

		$cur->{follow} and $cur->{follow} = FALSE;
		my $kid = $cur->{childs};

		if (@$kid
		    && ! ref($kid->[-1])
		    && $lit	  !~ /^[\-\^]/
		    && $kid->[-1] !~ /^\[\^/
		    && $kid->[-1] !~ /\-\]\z/
		    && $cur->{op} ne '&'
		    && !($cur->{op} eq '-' && @$kid == 1))
		# this is only simplification, so avoids uncertain cases
		{
		    substr($kid->[-1], -1, 0, $lit);
		}
		else {
		    push @$kid, "[$lit]";
		}
		next;
	    }

	    croak "panic or incomplete character class (missing last ']'?);";
	}
    }

    return $cur;
}

sub tostring {
    my $list = shift;

    for (@{ $list->{childs} }) {
	next  if !ref($_);
	croak "panic" if ref($_) ne 'HASH';
	$_ = tostring($_); # recursive
    }
    my $ret;
    my $op   = $list->{op} || FALSE;
    my $kids = $list->{childs};

    if ($op eq '&') {
	my $base = shift @$kids;
	my $pre  = join '', map "(?=$_)", @$kids;
	$ret = "(?:$pre$base)";
    }
    elsif ($op eq '-') {
	my $base = shift @$kids;
	my $pre  = join('|', @$kids);
	$ret = "(?:(?!$pre)$base)";
    }
    else {
	$ret = @$kids > 1 ? "(?:".join('|', @$kids).")" : $kids->[0];
    }
    return $list->{neg} ? "(?:(?!$ret)(?s:.))" : $ret;
}

1;
__END__

=head1 NAME

Unicode::Regex::Set - Subtraction and Intersection of Character Sets
in Unicode Regular Expressions

=head1 SYNOPSIS

    use Unicode::Regex::Set qw(parse);

    $regex = parse('[\p{Latin} & \p{L&} - A-Z]');

=head1 DESCRIPTION

Perl 5.8.0 misses subtraction and intersection of characters,
which is described in Unicode Regular Expressions (UTS #18).
This module provides a mimic syntax of character classes
including subtraction and intersection,
taking advantage of look-ahead assertions.

The syntax provided by this module is considerably incompatible
with the standard Perl's regex syntax.

Any whitespace character (that matches C</\s/>) is allowed between any tokens.
Square brackets (C<'['> and C<']'>) are used for grouping.
A literal whitespace and square brackets must be backslashed
(escaped with a backslash, C<'\'>).
You cannot put literal C<']'> at the start of a group.

A POSIX-style character class like C<[:alpha:]> is allowed
since its C<'['> is not a literal.

SEPARATORS (C<'&'> for intersection, C<'|'> for union, and C<'-'>
for subtraction) should be enclosed with one or more whitespaces.
E.g. C<[A&Z]> is a list of C<'A'>, C<'&'>, C<'Z'>.
C<[A-Z]> is a character range from C<'A'> to C<'Z'>.
C<[A-Z - Z]> is a set by removal of C<[Z]> from C<[A-Z]>.

Union operator C<'|'> may be omitted.
E.g. C<[A-Z | a-z]> is equivalent to C<[A-Z a-z]>,
and also to C<[A-Za-z]>.

Intersection operator C<'&'> has high precedence,
so C<[\p{A} \p{B} & \p{C} \p{D}]> is equivalent to
C<[\p{A} | [\p{B} & \p{C}] | \p{D}]>.

Subtraction operator C<'-'> has low precedence,
so C<[\p{A} \p{B} - \p{C} \p{D}]> is equivalent to
C<[[\p{A} | \p{B}] - [\p{C} | \p{D}] ]>.

C<[\p{A} - \p{B} - \p{C}]> is a set
by removal of C<\p{B}> and C<\p{C}> from C<\p{A}>.
It is equivalent to C<[\p{A} - [\p{B} \p{C}]]> and C<[\p{A} - \p{B} \p{C}]>.

Negation. when C<'^'> just after a group-opening C<'['>,
i.e. when they are combined as C<'[^'>, all the tokens following are negated.
E.g. C<[^A-Z a-z]> matches anything but neither C<[A-Z]> nor C<[a-z]>.
More clearly you can say this with grouping as C<[^ [A-Z a-z]]>.

If C<'^'> that is not next to C<'['> is prefixed
to a sequence of literal characters, character ranges,
and/or metacharacters, such a C<'^'> only negates that sequence;
e.g. C<[A-Z ^\p{Latin}]> matches C<A-Z> or a non-Latin character.
But C<[A-Z [^\p{Latin}]]> (or C<[A-Z \P{Latin}]>, for this is a simple case)
is recommended for clarity.

If you want to remove anything other than C<PERL> from C<[A-Z]>,
use C<[A-Z & PERL]> as well as C<[A-Z - [^PERL]]>.
Similarly, if you want to intersect C<[A-Z]> and a thing not C<JUNK>,
use C<[A-Z - JUNK]> as well as  C<[A-Z & [^JUNK]]>.

For further examples, please see tests.

=head1 FUNCTION

=over 4

=item C<$perl_regex = parse($unicode_character_class)>

parses a Character Class pattern according to F<Unicode Regular Expressions>
and converts it into a regular expression in Perl (returned as a string).

=back

=head1 AUTHOR

SADAHIRO Tomoyuki <[email protected]>

Copyright(C) 2003, SADAHIRO Tomoyuki. Japan. All rights reserved.

This module is free software; you can redistribute it
and/or modify it under the same terms as Perl itself.

=head1 SEE ALSO

=over 4

=item http://www.unicode.org/unicode/reports/tr18/

Unicode Regular Expression Guidelines - UTR #18
(to be Unicode Regular Expressions - UTS #18)

=back

=cut