package Lingua::ES::Hyphenate;
use strict;
use warnings;
require Exporter;
use Scalar::Util ();
# Inherit import() from Exporter so callers can pull in the public functions.
our @ISA = ('Exporter');
# Both public functions are exported by default; they may also be requested
# by name or through the ':all' tag.
our @EXPORT      = ( 'hyphenate', 'syllable_cnt' );
our %EXPORT_TAGS = ( all => [ 'hyphenate', 'syllable_cnt' ] );
our @EXPORT_OK   = @{ $EXPORT_TAGS{all} };
our $VERSION     = '.01';
=head1 NAME
Lingua::ES::Hyphenate - Separates Spanish words into syllables
=head1 SYNOPSIS
use Lingua::ES::Hyphenate;
@syllables = hyphenate('oportunidad'); # @syllables now holds ('o','por','tu','ni','dad')
# or
$word = Lingua::ES::Hyphenate->new('oportunidad');
@syllables = $word->hyphenate;
=head1 DESCRIPTION
Separates Spanish words into syllables.
=head1 SPANISH SYLLABLE STRUCTURE
<From Wikipedia>
The Spanish syllable structure can be summarized as follows: C1 C2 S1 V S2 C3 C4
Spanish syllable structure allows a maximum of two consonants in its onset,
a nucleus of a vowel followed by and/or preceded by a semivowel,
and a maximum of two consonants in its coda.
The following restrictions apply:
Onset
First consonant (C1): Can be any consonant.
Second consonant (C2): If and only if the first consonant is a plosive
/p, t, k, b, d, g/ or a voiceless labiodental fricative /f/,
then the second consonant can be a liquid /l, r/.
Although they occur, the onsets /tl/ and /dl/ are not native to Spanish.
Nucleus
Semivowel (S1)
Vowel (V)
Semivowel (S2)
Coda
First consonant (C3): Can be any consonant.
Second consonant (C4): Must be /s/.
=head1 SEE ALSO
http://en.wikipedia.org/wiki/Spanish_phonology#Phonotactics
=cut
# Syllable count of the most recently parsed word. It is written by
# hyphenate() and syllable_cnt(), and returned by syllable_cnt() when that
# function is called without a word.
my $cnt;# global variable for number of syllables in last parsed word
# Every character that may appear in a word; hyphenate() rejects any input
# that is not made up entirely of these characters.
# NOTE(review): the non-ASCII members of this class (and of the consonant and
# vowel classes below) are garbled in this copy of the file - presumably
# accented vowels and n-tilde. Confirm the file's encoding before editing them.
# The original author's note says /i did not case-fold one of the non-ASCII
# letters, which is presumably why two such characters follow N here.
my $letters = qr/[A�BCDE�FGHI�JKLMN��OPQRSTU�VWXYZ]/i;
# One consonant unit. The digraphs RR, LL, CH and QU are tried before the
# single letters, and the group is atomic ((?>...)), so once a digraph has
# matched the engine cannot backtrack and split it into two letters.
my $anyCons = qr/(?>RR|LL|CH|QU|[BCDFGHJKLMN�PQRSTVWXYZ])/i;# any consonant
my $preR = qr/[PKCBGFTD]/i; # These may precede R in an onset
my $preL = qr/[PKCBGF]/i; # These may precede L in an onset
# Second consonant of an onset: an R or L, accepted only when the lookbehind
# finds a consonant that is allowed to precede it. The first alternative is
# anchored at the start of the word and lets L follow any $preR consonant
# there, which additionally admits word-initial TL and DL.
my $C2 = qr/
(?<=^$preR)L # At the beginning of a word, a TL or DL (loan words)
| # or
(?<=$preR)R # PR KR CR BR GR FR TR DR
| # OR
(?<=$preL)L # PL KL CL BL GL FL
/ix;#
# Onset: one consonant unit, optionally followed by the liquid matched by $C2.
my $onset = qr/$anyCons$C2?/i;# C2 is optional
# U and I: they may sit on either side of a vowel inside a nucleus, or form a
# nucleus on their own (see $nucleus).
my $semiVowel = qr/[UI]/i;
# Characters that can be the core vowel of a nucleus.
# NOTE(review): the non-ASCII members are garbled here - presumably accented
# vowels; TODO confirm.
my $vowel = qr/[A�E�O���]/i;
# Union of $semiVowel and $vowel; used below to check what follows a coda.
my $allVows = qr/[UIA�E�O���]/i;
# Nucleus: a $vowel with an optional semivowel on either side, or a lone
# semivowel when no such vowel can be matched.
# NOTE(review): as written, two adjacent plain semivowels (e.g. 'ui' or 'iu')
# cannot share one nucleus, so each starts its own syllable - confirm whether
# that is intended.
my $nucleus = qr/(?:$semiVowel?$vowel$semiVowel?)|$semiVowel/i;
# Coda: one consonant unit, optionally followed by S.
my $coda = qr/${anyCons}S?/i;# separate $C4 variable seemed worthless.
# A whole syllable: optional onset, mandatory nucleus, optional coda. The coda
# is accepted only when the text after it could not combine with it to start
# the next syllable; the individual conditions are explained by the comments
# inside the pattern.
my $syllable = qr/
$onset? # onsets are optional
$nucleus # nuclei are not optional
(?: $coda
# We must make sure that the letters after the coda cannot be an
# onset to another syllable; if they are, we forget the coda and
# parse the next consonants as the onset of the next syllable.
(?(?<=$preL) # IF the matched $coda was a pre L consonant
(?!L) # don't match a following L
)
(?(?<=$preR) # IF the matched $coda was a pre R consonant
(?!R) # don't match a following R
)
(?!$allVows) # don't match a following vowel or semivowel
)? # coda is optional
/ix;# ignore case
=head1 CONSTRUCTOR
Not necessary, since functions are exported.
my $hyphenater = Lingua::ES::Hyphenate->new('charlar');
=cut
sub new {
    # Constructor: wraps the given word in an object of the invoking class.
    # The object is nothing more than a blessed reference to a private copy
    # of the word.
    my $class = $_[0];
    my $word  = $_[1];
    my $object = \$word;
    return bless $object, $class;
}
=head1 hyphenate
Returns array of syllables from input word.
my $hyphenater = Lingua::ES::Hyphenate->new('charlar');
@syllables = $hyphenater->hyphenate();
# or
@syllables = hyphenate('tomarlo');
=cut
sub hyphenate {
    # Splits a Spanish word into syllables.
    #
    # Call it as a function, hyphenate($word), or as a method on an object
    # built by new(), in which case the word stored in the object is used.
    #
    # Returns the list of syllables, or the empty list when the word is
    # undefined, empty, or contains a character outside $letters.
    # Side effect: records the number of syllables matched in the file-level
    # $cnt, which syllable_cnt() returns when it is called without a word.
    my ($arg) = @_;

    # Objects (subclasses included) are blessed scalar references to the word.
    my $is_object = Scalar::Util::blessed($arg)
        && $arg->isa('Lingua::ES::Hyphenate');
    my $word = $is_object ? ${$arg} : $arg;

    # Guard first so an object wrapping an undefined word does not trigger
    # "uninitialized value" warnings in the matches below.
    return () unless defined $word && length $word;
    return () unless $word =~ /^$letters+$/;

    # Append '=' to every syllable and split on it afterwards. A capture
    # group is used instead of $&, which on older perls slows down every
    # regex match in the program once it appears anywhere in the code.
    my $matched = ( $word =~ s/($syllable)/$1=/g );

    # s///g yields an empty string, not 0, when nothing was replaced.
    $cnt = $matched || 0;

    return split /=/, $word;
}
=head1 syllable_cnt
Returns number of syllables in string argument.
If no argument is provided, returns the number of
syllables in the last word parsed.
my $cnt = syllable_cnt('tomarlo');
# or
my $hyphenater = Lingua::ES::Hyphenate->new('charlar');
my $cnt = $hyphenater->syllable_cnt(); # counts the syllables of 'charlar'
# or
my @syllables = hyphenate('majaderías');
$cnt = syllable_cnt();
# same as
$cnt = @syllables;
=cut
sub syllable_cnt {
    # Counts the syllables in a word.
    #
    # Forms of call:
    #   $obj->syllable_cnt     counts the word stored in the object; any
    #                          further arguments are ignored.
    #   syllable_cnt($word)    counts the given word.
    #   syllable_cnt()         returns the count from the last word parsed
    #                          by hyphenate() or syllable_cnt(); this is
    #                          undef if nothing has been parsed yet.
    #
    # An undefined or empty $word in the function form is treated like the
    # no-argument form. Every fresh count is stored in the file-level $cnt.
    my $is_object = Scalar::Util::blessed( $_[0] )
        && $_[0]->isa('Lingua::ES::Hyphenate');

    my $word;
    if ($is_object) {
        # Objects (subclasses included) are blessed scalar references.
        $word = ${ $_[0] };
        $word = '' unless defined $word;
    }
    elsif ( @_ == 1 ) {
        $word = $_[0];
    }

    # Function form without a usable word: report the previous count.
    if ( !$is_object && !( defined $word && $word ne '' ) ) {
        return $cnt;
    }

    # Count the matches in list context. This leaves $word untouched and
    # yields a numeric 0 (rather than an empty string) when there are none.
    my $matches = () = $word =~ /$syllable/g;
    $cnt = $matches;
    return $cnt;
}
1;
=head1 AUTHOR
Nathan Glenn, <
[email protected]>
=head1 COPYRIGHT AND LICENSE
Copyright 2010 by Nathan Glenn
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=head1 NEEDS WORK
Atlanta splits as 'A-tlan-ta'. Is that correct? 'tl' and 'dl' are not
native sounds, and Atlanta is a loan word, so maybe it's okay.
'At-lan-ta' seems more natural to me.
=cut