package Lingua::ES::Hyphenate;

use strict;
use warnings;

require Exporter;

# Inherit import() from Exporter so the functions below can be exported.
our @ISA = qw(Exporter);

# Names that can be requested together with the ':all' tag.
our %EXPORT_TAGS = ( 'all' => [ qw(
hyphenate
syllable_cnt
) ] );

# Everything in ':all' may also be requested by name.
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );

# Both functions are exported by default.
our @EXPORT = qw(
hyphenate
syllable_cnt
);

# Written with an explicit integer part so the string is a strict decimal
# version; numerically identical to the previous '.01'.
our $VERSION = '0.01';

=head1 NAME

Lingua::ES::Hyphenate - Separates Spanish words into syllables

=head1 SYNOPSIS

use Lingua::ES::Hyphenate;

@syllables = hyphenate('oportunidad'); # @syllables now holds ('o','por','tu','ni','dad')

# or

$word = Lingua::ES::Hyphenate->new('oportunidad');
@syllables = $word->hyphenate;

=head1 DESCRIPTION

Separates Spanish words into syllables.

=head1 SPANISH SYLLABLE STRUCTURE

<From Wikipedia>
The Spanish syllable structure can be summarized as follows: C1 C2 S1 V S2 C3 C4
Spanish syllable structure allows a maximum of two consonants in its onset,
a nucleus of a vowel followed by and/or preceded by a semivowel,
and a maximum of two consonants in its coda.
The following restrictions apply:
Onset
First consonant (C1): Can be any consonant.
Second consonant (C2): If and only if the first consonant is a plosive
/p, t, k, b, d, g/ or a voiceless labiodental fricative /f/,
then the second consonant can be a liquid /l, r/.
Although they occur, the onsets /tl/ and /dl/ are not native to Spanish.
Nucleus
Semivowel (S1)
Vowel (V)
Semivowel (S2)
Coda
First consonant (C3): Can be any consonant.
Second consonant (C4): Must be /s/.

=head1 SEE ALSO

http://en.wikipedia.org/wiki/Spanish_phonology#Phonotactics

=cut

# Syllable count of the most recently parsed word. It is written by
# hyphenate() and syllable_cnt(), and returned by syllable_cnt() when that
# function is not given a word to count.
my $cnt;# global variable for number of syllables in last parsed word

# A single letter that hyphenate() accepts; input containing anything else
# is rejected there.
# NOTE(review): the trailing comment says /i did not case-fold one of the
# accented letters, which is presumably why extra forms are listed in the
# class. The non-ASCII characters in these classes should be checked against
# the file's real encoding -- TODO confirm.
my $letters = qr/[A�BCDE�FGHI�JKLMN��OPQRSTU�VWXYZ]/i;# Apparently perl doesn't know that � is lowercase for �
# One consonant "unit": the two-letter spellings RR, LL, CH and QU are tried
# first, then any single consonant letter. The group is atomic, (?>...), so
# once a two-letter spelling has matched the engine cannot back off and
# re-read it as a single letter.
#prevent backtracking here; otherwise two letter consonants won't work.
my $anyCons = qr/(?>RR|LL|CH|QU|[BCDFGHJKLMN�PQRSTVWXYZ])/i;# any consonant

# Consonants that may be the first member of a two-consonant onset.
my $preR = qr/[PKCBGFTD]/i; # These may precede R in an onset
my $preL = qr/[PKCBGF]/i; # These may precede L in an onset
# Second consonant of an onset: an L or R that is only accepted when the
# character just before it is one of the consonants allowed to precede it.
# The check is a lookbehind, so the first consonant itself is consumed by
# $anyCons in $onset below, not by this pattern.
my $C2 = qr/
(?<=^$preR)L # At the beginning of a word, a TL or DL (loan words)
| # or
(?<=$preR)R # PR KR CR BR GR FR TR DR
| # OR
(?<=$preL)L # PL KL CL BL GL FL
/ix;#
# Onset: one consonant unit, optionally followed by a permitted L or R.
my $onset = qr/$anyCons$C2?/i;# C2 is optional

# Letters treated as semivowels, letters treated as full vowels, and the
# union of the two (used to look ahead for the start of a following nucleus).
my $semiVowel = qr/[UI]/i;
my $vowel = qr/[A�E�O��]/i;
my $allVows = qr/[UIA�E�O��]/i;
# Nucleus: a vowel with an optional semivowel on either side, or a semivowel
# on its own. The top-level alternation stays self-contained when this
# pattern is interpolated elsewhere, because an interpolated qr// object is
# wrapped in its own group.
my $nucleus = qr/(?:$semiVowel?$vowel$semiVowel?)|$semiVowel/i;

# Coda: one consonant unit, optionally followed by S.
my $coda = qr/${anyCons}S?/i;# separate $C4 variable seemed worthless.

# One whole syllable: optional onset, mandatory nucleus, optional coda. The
# coda is only accepted when what follows it could not be read together with
# it as the onset of the next syllable (see the comments inside the pattern).
my $syllable = qr/
$onset? # onsets are optional
$nucleus # nuclei are not optional
(?: $coda
# We must make sure that the letters after the coda cannot be an
# onset to another syllable; if they are, we forget the coda and
# parse the next consonants as the onset of the next syllable.
(?(?<=$preL) # IF the matched $coda was a pre L consonant
(?!L) # don't match a following L
)
(?(?<=$preR) # IF the matched $coda was a pre R consonant
(?!R) # don't match a following R
)
(?!$allVows) # don't match a following vowel or semivowel
)? # coda is optional
/ix;# ignore case

=head1 CONSTRUCTOR

Not necessary, since functions are exported.

my $hyphenater = Lingua::ES::Hyphenate->new('charlar');

=cut

# Constructor: wraps a word in a blessed scalar reference.
#
#   $class - name of the class to bless the object into
#   $word  - the word the object will hold
#
# Returns the new object; dereferencing it ($$obj) yields the stored word.
sub new {
    my $class = shift;
    my $word  = shift;

    my $object = \$word;
    return bless $object, $class;
}

=head1 hyphenate

Returns array of syllables from input word.

my $hyphenater = Lingua::ES::Hyphenate->new('charlar');
@syllables = $hyphenater->hyphenate();

# or

@syllables = hyphenate('tomarlo');

=cut

# Splits a word into syllables.
#
# May be called as a plain function, hyphenate($word), or as a method on an
# object created by new(), in which case the word stored in the object is
# used.
#
# Returns the list of syllables, or an empty list when no word is given or
# the word contains anything other than the letters in $letters. As a side
# effect the number of syllables matched is stored in $cnt, where
# syllable_cnt() can later read it.
sub hyphenate {
    # No argument, or a false one such as '' or 0: nothing to split.
    $_[0] || return ();

    my $word;
    if (ref($_[0]) eq 'Lingua::ES::Hyphenate')
    {
        my $self = shift;
        $word = $$self;
    }
    else
    {
        $word = shift;
    }

    # An object built by new() without a word holds undef.
    defined $word or return ();

    # \A and \z anchor to the true start and end of the string. A plain '$'
    # would also accept a trailing newline, which would then come back as a
    # bogus extra element in the result.
    $word =~ /\A$letters+\z/ || return ();

    # Mark the end of every syllable with '='. A capture group is used
    # instead of $&, which slows down every regex in the program on older
    # perls. s///g yields '' when nothing matched, so normalise that to 0.
    $cnt = ($word =~ s/($syllable)/$1=/g) || 0;

    # The word held letters only, so '=' appears solely where it was inserted.
    split '=', $word;
}

=head1 syllable_cnt

Returns number of syllables in string argument.
If no argument is provided, returns the number of
syllables in the last word parsed.

my $cnt = syllable_cnt('tomarlo');

# or

my $hyphenater = Lingua::ES::Hyphenate->new('charlar');
my $cnt = $hyphenater->syllable_cnt('escuela');

# or

my @syllables = hyphenate('majaderE<iacute>as');
$cnt = syllable_cnt();
# same as
$cnt = @syllables;

=cut

# Returns the number of syllables in a word and remembers it in $cnt.
#
# Call forms:
#   syllable_cnt($word)         - counts $word
#   $obj->syllable_cnt($word)   - counts $word (explicit argument wins)
#   $obj->syllable_cnt()        - counts the word stored in the object
#   syllable_cnt()              - returns the count of the last word parsed
#
# A missing, undefined or empty word in function form also falls back to the
# count of the last word parsed. Unlike hyphenate(), the word is not checked
# against $letters first; characters that are not part of any syllable are
# simply skipped.
sub syllable_cnt{
    # Detect method-call form and take the object off the argument list.
    my $self = ref($_[0]) eq 'Lingua::ES::Hyphenate' ? shift : undef;

    my $word = (@_ == 1) ? shift : undef;
    my $have_word = defined $word && $word ne '';

    # In method form with no usable argument, count the object's own word.
    # This is counted even when it is empty, which yields 0.
    if (!$have_word && $self)
    {
        $word = defined $$self ? $$self : '';
        $have_word = 1;
    }

    # default: return number of syllables in last word
    return $cnt unless $have_word;

    # Count successive non-overlapping matches. Unlike the return value of
    # s///g, this gives a real 0 (not '') when there are no syllables, and
    # it does not need to modify the copy of the word.
    my $matches = 0;
    $matches++ while $word =~ /$syllable/g;

    $cnt = $matches;
    return $cnt;
}

1;
=head1 AUTHOR

Nathan Glenn, <[email protected]>

=head1 COPYRIGHT AND LICENSE

Copyright 2010 by Nathan Glenn

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 NEEDS WORK

Atlanta splits as 'A-tlan-ta'. Is that correct? 'tl' and 'dl' are not
native sounds, and Atlanta is a loan word, so maybe it's okay.
'At-lan-ta' seems more natural to me.

=cut