use strict;

#!/usr/bin/perl -w

use strict;
use vars qw($VERSION);

require 5.005;

$VERSION = sprintf "%d.%d", q$Revision: 1.48 $ =~ /(\d+)/g; # [email protected]

use Getopt::Long;

my $n = 1;
my ($help, $debug, $input, $latin1, $output, $tables, $version);
my $fold = 1;

sub help {
print <<EOF;
Usage: $0 [options] [files]
Compute the ngram frequencies and output the word lengths, ngrams,
initials ngrams, medial (non-initial non-final) ngrams, and final
ngrams. The frequencies will be shown as absolute, as relative to
the total frequency, and as relative to the maximum frequency.
Options:
--n=N The default is one character ngrams.
--nofold The default is to lowercase the ngrams.
--input=C The default is to use native 8-bit bytes and native definition
of "letters". Using this option requires Perl 5.8.
The input will be mapped from C to Unicode, and the Unicode
Letter definition will be used for "letters".
For example to input ISO 8859-1 characters, use -i=latin1.
--latin1 Assume the input to be ISO 8859-1. Does not require Perl 5.8.
--output=F Output to files F.ngl, F.ngr, F.ngi, F.ngm, F.ngf.
The default is to use STDOUT.
--tables Instead of a linear list of ngram frequencies output a list
of two-dimensional tables of ngram frequencies (for n > 1).
--help Show this help.
--version Show version.
--debug Show the ngrams in STDERR.
The options can be shortened to their unique prefixes and
the two dashes to one dash. No files means using STDIN.
EOF
exit(1);
}

help()
unless
GetOptions('n=i' => \$n,
'fold!' => \$fold,
'input=s' => \$input,
'latin1' => \$latin1,
'output=s' => \$output,
'tables' => \$tables,
'help' => \$help,
'version' => \$version,
'debug' => \$debug);

help() if $n < 1 || int($n) != $n;

sub version {
print $VERSION, "\n";
exit(1);
}

help() if $help;
version() if $version;

require 5.008 if $input;

my %length_freq;
my %ngram_freq;
my %initial_freq;
my %medial_freq;
my %final_freq;

@ARGV = ("-") unless @ARGV;

my $letter = $latin1 ?
'[A-Za-z��]' :
$] >= 5.008 ? '\pL' : '\w';

for my $fn (@ARGV) {
if (open(my $fh, $fn)) {
binmode($fh, ":encoding($input)") if $input;
while (<$fh>) {
chomp;
while (/($letter+)/go) {
my $s = $1;
my $l = length $s;
$length_freq{$l}++;
if ($l >= $n) {
if ($fold) {
if ($latin1) {
$s =~ tr/ABCDEFGHIJKLMNOPQRSTUVWXYZ��/abcdefghijklmnopqrstuvwxyz��/;
} else {
$s = lc $s;
}
}
my @n = ();
while ($s =~ /(.{$n})/g) {
$ngram_freq{$1}++;
push @n, $1;
pos($s) -= $n - 1;
}
print STDERR "@n\n" if $debug;
$initial_freq{my $ini = shift @n}++;
$final_freq {@n ? pop @n : $ini}++;
$medial_freq {$_}++ for @n;
}
}
}
} else {
warn "$0: Failed to open '$fn' for reading: $!\n";
}
}

sub summax {
my $freq = shift;

my $max = 0;
my $sum = 0;
for my $a (keys %{$freq}) {
next unless defined $freq->{$a};
$max = $freq->{$a} if $freq->{$a} > $max;
$sum += $freq->{$a};
}

return ($sum, $max);
}

sub table_show {
my ($out, $freq, $m, $sum, $max, $n, $s) = @_;
if ($n == 2) {
print $out length $s ? $s : " ";
for my $b (@$m) {
print $out "\t$b";
}
print $out "\n";
for my $a (@$m) {
print $out $a;
for my $b (@$m) {
my $sab = $s . $a . $b;
my $f = $freq->{$sab} || 0;
printf $out "\t%6d", $f;
}
print $out "\n";
}
} else {
for my $a (@$m) {
table_show($out, $freq, $m, $sum, $max, $n - 1, $s . $a);
}
}
}

sub show {
my ($out, $freq, $k, $m, $title) = @_;

print $out "$title Frequencies\n";
my ($sum, $max) = summax($freq);
if ($tables && defined $m && $n > 1) {
table_show($out, $freq, $m, $sum, $max, $n, '');
} else {
for my $a (@{ $k }) {
my $f = $freq->{$a} || 0;
printf $out "%${n}s\t%6d\t%9.6f\t%.6f\n",
$a, $f, $sum ? $f / $sum : 0, $max ? $f / $max : 0;
}
printf $out "%${n}s\t%6d\t%9.6f\n",
"Sum", $sum, 1.0;
}
}

my $outputfh = *STDOUT;

binmode(STDOUT, ":encoding($input)") if $input && !defined $output;

sub output_open {
my $suffix = shift;
my $outputfn = "$output.ng$suffix";
if (open($outputfh, ">$outputfn")) {
binmode($outputfh, ":encoding($input)") if $input;
} else {
warn "$0: Failed to open '$outputfn' for writing: $!\n";
}
}

sub freq_show {
my ($outputfh, $freq, $k, $m, $suffix, $title) = @_;
output_open($suffix) if defined $output;
show($outputfh, $freq, $k, $m, $title);
}

my @l = sort { $a <=> $b } keys %length_freq;

freq_show($outputfh, \%length_freq,
@l ? [ 1 .. $l[-1] ] : [], undef, 'l', "Word Length");

my @k = sort keys %ngram_freq;
my @m;

if ($tables && $n > 1) {
my %s;
@s{ split // } = () for @k;
@m = sort keys %s;
}

freq_show($outputfh, \%ngram_freq, \@k, \@m, 'r', "Ngram");
freq_show($outputfh, \%initial_freq, \@k, \@m, 'i', "Initial Ngram");
freq_show($outputfh, \%medial_freq, \@k, \@m, 'm', "Medial Ngram");
freq_show($outputfh, \%final_freq, \@k, \@m, 'f', "Final Ngram");

exit(0);

__END__
=head1 NAME

ngram - compute and display frequencies of ngrams

=head1 SYNOPIS

ngram [--help] [--n=2] [input files]

=head1 DESCRIPTION

This script computes and display the ngram frequencies in its input.
If no ngram length is specified, 1 (one) is assumed.
If no files are specified, STDIN is read.

=head2 Sample Output

With the input C<banana and bandana> and ngram length of one would get
the following output. The initial, medial, and final ngrams refer to
I<word-initial>, and so forth -- but note that the script doesn't have
artificial intelligence built in: C<doesn't> is two words, as is
C<inter-continental>. The C<#> are comments added here for the sake
of documentation, they are not part of the real output.

Word Length Frequencies
1 0 0.000000 0.000000
2 0 0.000000 0.000000
3 1 0.333333 1.000000 # We had one word of length 3.
4 0 0.000000 0.000000
5 0 0.000000 0.000000
6 1 0.333333 1.000000
7 1 0.333333 1.000000
Sum 3 1.000000 # We had three words total.
Ngram Frequencies
a 7 0.437500 1.000000 # We had seven letters "a".
b 2 0.125000 0.285714
d 2 0.125000 0.285714
n 5 0.312500 0.714286
Sum 16 1.000000 # We had sixteen letters total.
Initial Ngram Frequencies
a 1 0.333333 0.500000 # We had one word beginning with "a".
b 2 0.666667 1.000000 # We had two words beginning with "b".
d 0 0.000000 0.000000 # We had no words beginning with "d".
n 0 0.000000 0.000000
Sum 3 1.000000
Medial Ngram Frequencies
a 4 0.400000 0.800000
b 0 0.000000 0.000000 # We had no "b"s in the middle.
d 1 0.100000 0.200000 # We only one "d" in the middle.
n 5 0.500000 1.000000
Sum 10 1.000000
Final Ngram Frequencies
a 2 0.666667 1.000000
b 0 0.000000 0.000000 # We had no "b"s ending words.
d 1 0.333333 0.500000
n 0 0.000000 0.000000
Sum 3 1.000000

The sum of initial, medial, and final ngram frequencies equals the
number of all ngrams.

=head2 Options

--n=N

The default is to use one character ngrams, in other words, single
characters. With the C<-n> option you can change the ngram length.
I<N> must be greater than or equal to one.

--nofold

The default is to lowercase all the letters before counting the frequencies:
for example C<Ab> is lowercased to C<ab>. With the C<--nofold> option this
lowercasing is not done.

--input=C

The default is to use the native 8-bit bytes and the native definition of
"letters". With the C<--input> option any character set and encoding
recognized by the Encode extension can be used as the input. Using this
option unfortunately requires at least Perl 5.8.0, but for a common
character set see the C<--latin1> option.

--latin1

Assume the input to be in ISO 8859-1 (Latin 1). Using this option
does not require Perl 5.8.

--output=F

Output the resulting ngram frequencies to files C<F.ngl>, C<F.ngr>,
C<F.ngi>, C<F.ngm>, C<F.ngf> for the lengths, ngrams, initial ngrams,
medial ngrams, and final ngrams, respectively. The default is to output
all the results to the standard output, and in a different order (lengths,
ngrams, initial ngrams, medial ngrams, final ngrams).

--tables

Relevant only if I<N> is two or more: usually a linear list of the
ngram frequencies is output, but with the C<--table> option a list of
two-dimensional tables is shown. This may make it easier to visualize
the distribution of the ngrams. The relative frequencies and sums are
not shown, only the absolute frequencies.

--help

Show a concise help message.

--version

Show version of the F<ngram.pl> script.

--debug

Output various debugging information. Currently shows the detected
ngrams to the standard error output.

All the options can be shortened to their unique prefixes, and also the
leading C<--> be shortened to a single C<->.

=head1 PREREQUISITES

Getopt::Long
strict
vars

=head1 COREQUISITES

Encode

=head1 SCRIPT CATEGORIES

Text::Statistics

=head1 README

Compute ngram (one letter, digram, trigram, ...) frequencies.
Useful for generating text using Markov chains or for cryptogeeks.

=head1 SEE ALSO

Simon Cozen's Text::Ngram module in CPAN

=head1 COPYRIGHT

(C) 2003 by Jarkko Hietaniemi <[email protected]>

All rights reserved. You may distribute this code under the terms
of either the GNU General Public License or the Artistic License,
as specified in the Perl README file.

=cut