#!/usr/bin/perl -w

use strict;
use vars qw($VERSION);

require 5.005;

$VERSION = sprintf "%d.%d", q$Revision: 1.48 $ =~ /(\d+)/g; # [email protected]

use Getopt::Long;

my $n = 1;
my ($help, $debug, $input, $latin1, $output, $tables, $version);
my $fold = 1;

sub help {
   print <<EOF;
Usage: $0 [options] [files]
Compute the ngram frequencies and output the word lengths, ngrams,
initials ngrams, medial (non-initial non-final) ngrams, and final
ngrams.  The frequencies will be shown as absolute, as relative to
the total frequency, and as relative to the maximum frequency.
Options:
--n=N           The default is one character ngrams.
--nofold        The default is to lowercase the ngrams.
--input=C       The default is to use native 8-bit bytes and native definition
               of "letters".  Using this option requires Perl 5.8.
               The input will be mapped from C to Unicode, and the Unicode
               Letter definition will be used for "letters".
               For example to input ISO 8859-1 characters, use -i=latin1.
--latin1        Assume the input to be ISO 8859-1.  Does not require Perl 5.8.
--output=F      Output to files F.ngl, F.ngr, F.ngi, F.ngm, F.ngf.
               The default is to use STDOUT.
--tables        Instead of a linear list of ngram frequencies output a list
               of two-dimensional tables of ngram frequencies (for n > 1).
--help          Show this help.
--version       Show version.
--debug         Show the ngrams in STDERR.
The options can be shortened to their unique prefixes and
the two dashes to one dash.  No files means using STDIN.
EOF
   exit(1);
}

help()
   unless
     GetOptions('n=i'        => \$n,
                'fold!'      => \$fold,
                'input=s'    => \$input,
                'latin1'     => \$latin1,
                'output=s'   => \$output,
                'tables'     => \$tables,
                'help'       => \$help,
                'version'    => \$version,
                'debug'      => \$debug);

help() if $n < 1 || int($n) != $n;

sub version {
   print $VERSION, "\n";
   exit(1);
}

help()    if $help;
version() if $version;

require 5.008 if $input;

my %length_freq;
my %ngram_freq;
my %initial_freq;
my %medial_freq;
my %final_freq;

@ARGV = ("-") unless @ARGV;

my $letter = $latin1 ?
   '[A-Za-z�����������������������������������������������������������]' :
   $] >= 5.008 ? '\pL' : '\w';

for my $fn (@ARGV) {
   if (open(my $fh, $fn)) {
       binmode($fh, ":encoding($input)") if $input;
       while (<$fh>) {
           chomp;
           while (/($letter+)/go) {
               my $s = $1;
               my $l = length $s;
               $length_freq{$l}++;
               if ($l >= $n) {
                   if ($fold) {
                       if ($latin1) {
                           $s =~ tr/ABCDEFGHIJKLMNOPQRSTUVWXYZ������������������������������/abcdefghijklmnopqrstuvwxyz���������������������������/;
                       } else {
                           $s = lc $s;
                       }
                   }
                   my @n = ();
                   while ($s =~ /(.{$n})/g) {
                       $ngram_freq{$1}++;
                       push @n, $1;
                       pos($s) -= $n - 1;
                   }
                   print STDERR "@n\n" if $debug;
                   $initial_freq{my $ini = shift @n}++;
                   $final_freq  {@n ? pop @n : $ini}++;
                   $medial_freq {$_}++ for @n;
               }
           }
       }
   } else {
       warn "$0: Failed to open '$fn' for reading: $!\n";
   }
}

sub summax {
   my $freq = shift;

   my $max = 0;
   my $sum = 0;
   for my $a (keys %{$freq}) {
       next unless defined $freq->{$a};
       $max  = $freq->{$a} if $freq->{$a} > $max;
       $sum += $freq->{$a};
   }

   return ($sum, $max);
}

sub table_show {
   my ($out, $freq, $m, $sum, $max, $n, $s) = @_;
   if ($n == 2) {
       print $out length $s ? $s : " ";
       for my $b (@$m) {
           print $out "\t$b";
       }
       print $out "\n";
       for my $a (@$m) {
           print $out $a;
           for my $b (@$m) {
               my $sab = $s . $a . $b;
               my $f = $freq->{$sab} || 0;
               printf $out "\t%6d", $f;
           }
           print $out "\n";
       }
   } else {
       for my $a (@$m) {
           table_show($out, $freq, $m, $sum, $max, $n - 1, $s . $a);
       }
   }
}

sub show {
   my ($out, $freq, $k, $m, $title) = @_;

   print $out "$title Frequencies\n";
   my ($sum, $max) = summax($freq);
   if ($tables && defined $m && $n > 1) {
       table_show($out, $freq, $m, $sum, $max, $n, '');
   } else {
       for my $a (@{ $k }) {
           my $f = $freq->{$a} || 0;
           printf $out "%${n}s\t%6d\t%9.6f\t%.6f\n",
                  $a, $f, $sum ? $f / $sum : 0, $max ? $f / $max : 0;
       }
       printf $out "%${n}s\t%6d\t%9.6f\n",
              "Sum", $sum, 1.0;
   }
}

my $outputfh = *STDOUT;

binmode(STDOUT, ":encoding($input)") if $input && !defined $output;

sub output_open {
   my $suffix = shift;
   my $outputfn = "$output.ng$suffix";
   if (open($outputfh, ">$outputfn")) {
       binmode($outputfh, ":encoding($input)") if $input;
   } else {
       warn "$0: Failed to open '$outputfn' for writing: $!\n";
   }
}

sub freq_show {
   my ($outputfh, $freq, $k, $m, $suffix, $title) = @_;
   output_open($suffix) if defined $output;
   show($outputfh, $freq, $k, $m, $title);
}

my @l = sort { $a <=> $b } keys %length_freq;

freq_show($outputfh, \%length_freq,
         @l ? [ 1 .. $l[-1] ] : [], undef, 'l', "Word Length");

my @k = sort keys %ngram_freq;
my @m;

if ($tables && $n > 1) {
   my %s;
   @s{ split // } = () for @k;
   @m = sort keys %s;
}

freq_show($outputfh, \%ngram_freq,   \@k, \@m, 'r', "Ngram");
freq_show($outputfh, \%initial_freq, \@k, \@m, 'i', "Initial Ngram");
freq_show($outputfh, \%medial_freq,  \@k, \@m, 'm', "Medial Ngram");
freq_show($outputfh, \%final_freq,   \@k, \@m, 'f', "Final Ngram");

exit(0);

__END__
=head1 NAME

ngram - compute and display frequencies of ngrams

=head1 SYNOPIS

 ngram [--help] [--n=2] [input files]

=head1 DESCRIPTION

This script computes and display the ngram frequencies in its input.
If no ngram length is specified, 1 (one) is assumed.
If no files are specified, STDIN is read.

=head2 Sample Output

With the input C<banana and bandana> and ngram length of one would get
the following output.  The initial, medial, and final ngrams refer to
I<word-initial>, and so forth -- but note that the script doesn't have
artificial intelligence built in: C<doesn't> is two words, as is
C<inter-continental>.  The C<#> are comments added here for the sake
of documentation, they are not part of the real output.

 Word Length Frequencies
 1          0   0.000000       0.000000
 2          0   0.000000       0.000000
 3          1   0.333333       1.000000 # We had one word of length 3.
 4          0   0.000000       0.000000
 5          0   0.000000       0.000000
 6          1   0.333333       1.000000
 7          1   0.333333       1.000000
 Sum        3   1.000000                # We had three words total.
 Ngram Frequencies
 a          7   0.437500       1.000000 # We had seven letters "a".
 b          2   0.125000       0.285714
 d          2   0.125000       0.285714
 n          5   0.312500       0.714286
 Sum       16   1.000000                # We had sixteen letters total.
 Initial Ngram Frequencies
 a          1   0.333333       0.500000 # We had one word beginning with "a".
 b          2   0.666667       1.000000 # We had two words beginning with "b".
 d          0   0.000000       0.000000 # We had no words beginning with "d".
 n          0   0.000000       0.000000
 Sum        3   1.000000
 Medial Ngram Frequencies
 a          4   0.400000       0.800000
 b          0   0.000000       0.000000 # We had no "b"s in the middle.
 d          1   0.100000       0.200000 # We only one "d" in the middle.
 n          5   0.500000       1.000000
 Sum       10   1.000000
 Final Ngram Frequencies
 a          2   0.666667       1.000000
 b          0   0.000000       0.000000 # We had no "b"s ending words.
 d          1   0.333333       0.500000
 n          0   0.000000       0.000000
 Sum        3   1.000000

The sum of initial, medial, and final ngram frequencies equals the
number of all ngrams.

=head2 Options

 --n=N

The default is to use one character ngrams, in other words, single
characters.  With the C<-n> option you can change the ngram length.
I<N> must be greater than or equal to one.

 --nofold

The default is to lowercase all the letters before counting the frequencies:
for example C<Ab> is lowercased to C<ab>.  With the C<--nofold> option this
lowercasing is not done.

 --input=C

The default is to use the native 8-bit bytes and the native definition of
"letters".  With the C<--input> option any character set and encoding
recognized by the Encode extension can be used as the input.  Using this
option unfortunately requires at least Perl 5.8.0, but for a common
character set see the C<--latin1> option.

 --latin1

Assume the input to be in ISO 8859-1 (Latin 1).  Using this option
does not require Perl 5.8.

 --output=F

Output the resulting ngram frequencies to files C<F.ngl>, C<F.ngr>,
C<F.ngi>, C<F.ngm>, C<F.ngf> for the lengths, ngrams, initial ngrams,
medial ngrams, and final ngrams, respectively.  The default is to output
all the results to the standard output, and in a different order (lengths,
ngrams, initial ngrams, medial ngrams, final ngrams).

 --tables

Relevant only if I<N> is two or more: usually a linear list of the
ngram frequencies is output, but with the C<--table> option a list of
two-dimensional tables is shown.  This may make it easier to visualize
the distribution of the ngrams.  The relative frequencies and sums are
not shown, only the absolute frequencies.

 --help

Show a concise help message.

 --version

Show version of the F<ngram.pl> script.

 --debug

Output various debugging information.  Currently shows the detected
ngrams to the standard error output.

All the options can be shortened to their unique prefixes, and also the
leading C<--> be shortened to a single C<->.

=head1 PREREQUISITES

Getopt::Long
strict
vars

=head1 COREQUISITES

Encode

=head1 SCRIPT CATEGORIES

Text::Statistics

=head1 README

Compute ngram (one letter, digram, trigram, ...) frequencies.
Useful for generating text using Markov chains or for cryptogeeks.

=head1 SEE ALSO

Simon Cozen's Text::Ngram module in CPAN

=head1 COPYRIGHT

(C) 2003 by Jarkko Hietaniemi <[email protected]>

All rights reserved. You may distribute this code under the terms
of either the GNU General Public License or the Artistic License,
as specified in the Perl README file.

=cut