#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use File::Find;
use LWP::Simple;
use HTML::LinkExtor;
use URI::URL;

# Package globals shared between the main program and the subs below.
# They are declared with `use vars` so that every sub in this file can see
# them under `use strict`.
use vars qw($directory $extension $frequency $depth
            $ignore $keywords $logfile $output
            $verbose $help $recurse @words $words
            $URL %count $parser %Docs
            $p $host
           );

# File-scoped scratch variable used as a loop variable by the reporting code.
my ($word);

# Allow single-letter options to be bundled (e.g. -rfv).
Getopt::Long::Configure("bundling");
GetOptions(
    "h+"  => \$help,
    "v+"  => \$verbose,
    "r+"  => \$recurse,   # For file searches
    "f+"  => \$frequency,
    "H+"  => \$host,
    "d:s" => \$directory,
    "e:s" => \$extension,
    "l:s" => \$logfile,
    "o:s" => \$output,
    "D:i" => \$depth,     # For web searches
) || Usage();             # unknown or malformed option: show usage, don't carry on

$help && Usage();

# The usage text documents a default depth of 0 (start page only); make that
# explicit so later numeric comparisons never see an undefined value.
$depth = 0 unless defined $depth;

# Everything left on the command line is a keyword (in web mode the first
# remaining argument is the start URL).
@words = @ARGV;
Usage() unless @words;

# Alternation of all arguments, used as a quick "does any keyword occur?" test.
$words = join '|', @words;

if ($logfile) {
    # Three-argument open: the file name can never be mistaken for a mode.
    open(LOG, '>', $logfile) ||
        die("Could not open $logfile for writing: $!\n");
} # End if

if ($directory) {
    # Search local file system
    $directory =~ s!/$!! unless $directory eq '/';   # keep the root directory usable
    $verbose && print "Searching $directory\n";
    $logfile && print LOG "Searching $directory\n";

    # Optional -e filter: the suffix is matched literally at the end of the
    # file name (\Q..\E escapes every regex metacharacter, not only dots).
    my $suffix_re = $extension ? qr/\Q$extension\E$/ : undef;

    if ($recurse) {
        # no_chdir keeps the working directory fixed, so the path in
        # $File::Find::name (which search() opens) stays valid even when
        # $directory is a relative path.
        find({
            no_chdir => 1,
            wanted   => sub {
                return unless -f $File::Find::name;
                # Honour -e in recursive mode too.
                return if $suffix_re && $File::Find::name !~ $suffix_re;
                search();   # search() picks the path up from $File::Find::name
            },
        }, $directory);
    } else {
        opendir(DIR, $directory) ||
            die "Could not read $directory: $!\n";
        # Keep only entries that look like text files.
        my @files = grep { -T "$directory/$_" } readdir DIR;
        closedir DIR;
        @files = grep { $_ =~ $suffix_re } @files if $suffix_re;
        $verbose && print "Found files: ", (join ", ", @files), "\n";

        search("$directory/$_") for @files;
    } # End else

} elsif (($URL = shift @words) =~ m!^(?:https?|ftp)://!i) {
    # Search WWW

    # The first argument was the start URL, not a keyword: insist on at
    # least one real keyword and rebuild the keyword alternation without
    # the URL in it.
    Usage() unless @words;
    $words = join '|', @words;

    $p = HTML::LinkExtor->new();
    if ($host) {
        # Reduce the start URL to "scheme://host[:port]"; links are later
        # compared against this prefix.
        ($host = $URL) =~ s!((.*?)//(.*?))/.*$!$1!;
        $verbose && print "Restricting queries to pages on $host\n";
    }

    searchpage(0, $URL);
    if ($verbose) {
        print "Looked at $_\n" for keys %Docs;
    }
} else {
    Usage();
}

# Results
#
# %count maps each searched file name or URL to a hash of keyword => tally.
# With -o the report goes to the output file (and is echoed to STDOUT when
# -v is also given); without -o it goes to STDOUT only.
my $out_fh;
if ($output) {
    # Three-argument open with a lexical handle: the file name can never be
    # interpreted as an open mode.
    open($out_fh, '>', $output) ||
        die("Could not open $output for writing: $!\n");
} # End if

# Write one piece of the report to the destination(s) described above.
my $emit = sub {
    my ($text) = @_;
    if ($out_fh) {
        print $out_fh $text;
        $verbose && print $text;
    } else {
        print $text;
    }
};

if ($frequency) { # With -f: one "source|keyword|count" line per keyword
    for my $source (sort keys %count) {
        for $word (sort keys %{$count{$source}}) {
            $emit->("$source|$word|$count{$source}{$word}\n");
        } # End for words
    } # End for files
} else { # Without -f: one "source|kw1,kw2,..." line per source
    for my $source (sort keys %count) {
        my $found = join ',', sort (keys %{$count{$source}});
        $emit->("$source|");
        $emit->("$found\n");
    } # End for files
} #  End (not -f)

# Buffered write errors only surface at close time, so check the result.
if ($logfile) {
    close(LOG) || warn("Could not finish writing $logfile: $!\n");
}
if ($out_fh) {
    close($out_fh) || die("Could not finish writing $output: $!\n");
}

sub search {
    # Search one text file for the keywords.
    #
    # The file to read is $File::Find::name when that is set (i.e. when
    # called as a File::Find callback), otherwise the first argument.
    #
    # For every line matching the combined pattern $words, each keyword in
    # @words that matches the line gets $count{$file}{$keyword} incremented,
    # so the tally is "number of lines containing the keyword".  With -v the
    # matching lines are echoed with their line numbers.
    #
    # Files that are missing, not plain files, binary, or unreadable are
    # skipped; an unreadable file is reported on STDERR and in the log file.
    my $file = $File::Find::name || shift;

    return unless defined $file && -f $file;
    return if -B $file;

    $verbose && print "Searching $file\n";

    # Three-argument open with a lexical handle: a file name such as
    # ">foo" or "cmd |" is opened as a file name, never as a mode or pipe.
    my $fh;
    unless (open($fh, '<', $file)) {
        my $problem = "Could not open $file for reading: $!\n";
        warn $problem;
        $logfile && print LOG $problem;
        return;
    }

    while (my $line = <$fh>) {
        next unless $line =~ /$words/;   # cheap pre-filter: any keyword at all?
        $verbose && print " $.:\t$line";
        for my $keyword (@words) {
            $count{$file}{$keyword}++ if $line =~ /$keyword/;
        }
    }
    close $fh;
    return;
} # End sub search

sub searchpage  {
    # Fetch one page, count keyword occurrences in it, and follow its links.
    #
    # Arguments:
    #   $cur_depth - how many links away from the start URL this page is
    #   $url       - absolute URL of the page to examine
    #
    # Every URL passed in is recorded in %Docs so it is never visited twice.
    # For each keyword in @words, the number of case-insensitive matches in
    # the page body is stored in $count{$url}{$keyword} (only when non-zero).
    # <a href> links are then followed recursively until $depth is reached.
    #
    # Returns 0 when the page was not examined (too deep, or it could not be
    # fetched) and 1 otherwise.
    my ($cur_depth, $url) = @_;

    $verbose && print "Looking at $url, at depth $cur_depth\n";
    $Docs{$url} = 1;

    # -D is optional; treat "not given" as depth 0 (start page only).
    my $max_depth = defined $depth ? $depth : 0;
    return(0) if ($cur_depth > $max_depth);

    # get() returns undef on any fetch failure.
    my $content = get($url);
    unless (defined $content) {
        $verbose && print "Could not fetch $url\n";
        $logfile && print LOG "Could not fetch $url\n";
        return(0);
    }

    for my $keyword (@words) {
        my $hits = 0;
        $hits++ while $content =~ /$keyword/gis;
        $count{$url}{$keyword} = $hits if $hits;
    }

    # A fresh extractor per page, finished with eof(), so that no partially
    # parsed text is carried over from one document into the next.
    my $extractor = HTML::LinkExtor->new();
    $extractor->parse($content);
    $extractor->eof;

    LINK:
    for my $link ($extractor->links) {
        # Each link is [ tag, attr => url, ... ]; only <a href="..."> counts.
        my ($tag, %attr) = @$link;
        next LINK unless $tag eq 'a' && defined $attr{href};

        my $abs = url($attr{href}, $url)->abs->as_string;
        $abs =~ s/#.*$//;    # drop the fragment
        $abs =~ s!/$!!;      # drop a trailing slash

        # Skip some URLs
        next LINK unless $abs;
        # Only schemes we can fetch (this also drops mailto:, javascript:, ...).
        next LINK unless $abs =~ m!^(?:https?|ftp)://!i;
        # Archives and executables, recognised by a real file extension.
        next LINK if $abs =~ /\.(?:gz|zip|exe|tar|Z)$/;
        # With -H, stay on the start host.  The host is matched literally and
        # must be followed by a delimiter, so "http://a.com" does not accept
        # "http://a.com.evil.org".
        next LINK if $host && $abs !~ m!^\Q$host\E(?:[/:?]|$)!i;
        # Pages generated from query strings.
        next LINK if $abs =~ /\?\S+?=\S+/;

        searchpage($cur_depth+1, $abs) unless
            ($Docs{$abs} || ($cur_depth+1 > $max_depth));
    }
    return(1);
} # End sub searchpage

sub Usage {
    # Print the command-line synopsis to STDOUT and terminate the program
    # with exit status 0.  Takes no arguments and never returns.
    #
    # The text lists exactly the options that the GetOptions call in this
    # file accepts.
    print <<EndUsage;

Usage: keywordsearch [-rfv] [-e suffix] [-l log] [-o outfile]
                     -d dir keywords
       keywordsearch [-fvH] [-D depth] [-l log] [-o outfile]
                     URL keywords
Options:
    -d          Directory to search
    -e          File extension to search (i.e. .txt .html)
                Default behavior is to search only text-type files
    -f          Output the frequency of the words
    -D          Depth of search, or how many levels down to go
                (Web search) (Default is 0)
    -r          Recurse subdirectories (File search)
    -l          Log file for problems found
    -o          Output file (Default is STDOUT)
    -v          Verbose
    -H          Restrict to the initial host (Web search)
    -h          Show this help text

EndUsage
    exit(0);
}

=head1 NAME

keywordsearch - Searches a web site, or a local directory, for keyword(s)

=head1 DESCRIPTION

Simple command line tool for searching either a web site, or a local directory,
for documents that contain a particular word.

=head1 PREREQUISITES

Uses C<Getopt::Long>, C<File::Find>, C<LWP::Simple>, C<HTML::LinkExtor> and C<URI::URL>

=head1 COREQUISITES

None

=head1 README

Simple command-line tool for searching either a web site, or a local
directory, for documents containing particular keyword(s).

=pod OSNAMES

Any

=pod SCRIPT CATEGORIES

Search

=head1 Author

Written by Rich Bowen <[email protected]> for The Creative Group
(<http://www.cre8tivegroup.com>)

=cut