#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use File::Find;
use LWP::Simple;
use HTML::LinkExtor;
use URI::URL;
# Placeholder hash from the original script; nothing below stores into it.
my (%options);

# Package globals shared by the main program and the subs further down.
# %count maps document name => keyword => number of hits; %Docs records
# every URL that has been looked at during a web search.
use vars qw(
    $directory $extension $frequency $depth
    $ignore $keywords $logfile $output
    $verbose $help $recurse @words $words
    $URL %count $parser %Docs
    $p $host
);
my ($word);

# "bundling" lets single-letter switches be combined, e.g. -rfv.
Getopt::Long::Configure("bundling");

# GetOptions warns on STDERR and returns false when it meets an unknown or
# malformed option. Show the usage text in that case instead of silently
# searching with a half-understood command line.
GetOptions(
    "h+"  => \$help,
    "v+"  => \$verbose,
    "r+"  => \$recurse,      # For file searches
    "f+"  => \$frequency,
    "H+"  => \$host,
    "d:s" => \$directory,
    "e:s" => \$extension,
    "l:s" => \$logfile,
    "o:s" => \$output,
    "D:i" => \$depth,        # For web searches
) or Usage();

Usage() if $help;

# The usage text documents 0 as the default depth; make that explicit so
# later numeric comparisons never see an undefined value.
$depth = 0 unless defined $depth;

# Everything left on the command line is a keyword (or, for a web search,
# the start URL followed by keywords).
@words = @ARGV;
chomp @words;
Usage() unless @words;

# Single alternation of all arguments, used as a cheap "any keyword" test.
$words = join '|', @words;
# Open the log file requested with -l. LOG stays a bareword (package-global)
# handle because it is closed by name at the end of the script.
if ($logfile) {
    open(LOG, '>', $logfile)
        or die("Could not open $logfile for writing: $!\n");
}

if ($directory) {
    # Search local file system

    # Drop one trailing slash, but leave a bare "/" alone so that searching
    # the root directory does not turn into searching the empty string.
    $directory =~ s!(?<=.)/$!!;
    $verbose && print "Searching $directory\n";
    $logfile && print LOG "Searching $directory\n";

    if ($recurse) {
        # File::Find calls search() once for every entry below $directory.
        find(\&search, $directory);
    }
    else {
        opendir(my $dh, $directory)
            or die "Could not read $directory: $!\n";
        # Keep only entries that look like text files.
        my @files = grep { -T "$directory/$_" } readdir $dh;
        closedir $dh;

        if ($extension) {
            # Make the dots in the suffix literal before using it as a
            # pattern anchored at the end of the file name.
            $extension =~ s!\.!\\.!g;
            @files = grep { /$extension$/ } @files;
        }
        $verbose && print "Found files: ", (join ", ", @files), "\n";
        search("$directory/$_") for @files;
    }
}
elsif (($URL = shift @words) =~ m!^(ht|f)tp://!) {
    # Search WWW

    # The first argument was the start URL, not a keyword. Without any
    # keywords left there is nothing to look for, and the alternation built
    # before the URL was removed must be rebuilt so the URL itself is not
    # treated as a search pattern.
    Usage() unless @words;
    $words = join '|', @words;

    $p = HTML::LinkExtor->new();
    if ($host) {
        # -H: reduce the start URL to "scheme://host" and use that as the
        # prefix every followed link has to share.
        ($host = $URL) =~ s!((.*?)//(.*?))/.*$!$1!;
        $verbose && print "Restricting queries to pages on $host\n";
    }
    searchpage(0, $URL);
    for (keys %Docs) {
        $verbose && print "Looked at $_\n";
    }
}
else {
    # Neither -d nor something that looks like an http/ftp URL.
    Usage();
}
# Results
# With -o the report goes to that file (and is echoed to STDOUT only when
# -v is given); without -o it goes to STDOUT.
my $out_fh;
if ($output) {
    # Three-argument open so that characters in the file name can never be
    # interpreted as an open mode.
    open($out_fh, '>', $output)
        or die("Could not open $output for writing: $!\n");
}

# Send one piece of report text to the right destination(s).
my $emit = sub {
    my ($text) = @_;
    if ($out_fh) {
        print {$out_fh} $text;
        print $text if $verbose;
    }
    else {
        print $text;
    }
};

# The keys of %count are document names (file paths or URLs); each value is
# a hash of keyword => number of hits.
for my $doc (sort keys %count) {
    my $hits_for = $count{$doc};
    if ($frequency) {    # With -f: one "doc|word|count" line per keyword
        for my $word (sort keys %$hits_for) {
            $emit->("$doc|$word|$hits_for->{$word}\n");
        }
    }
    else {               # Without -f: "doc|word,word,..." on a single line
        $emit->("$doc|" . join(',', sort keys %$hits_for) . "\n");
    }
}

# LOG is the bareword handle opened earlier in the script when -l was given.
close LOG if $logfile;

# Buffered write errors only surface at close, so check it.
if ($out_fh) {
    close($out_fh)
        or die("Could not finish writing $output: $!\n");
}
sub search {
    # Search one text file for the keywords and record the hits in %count.
    #
    # Called in two ways:
    #   * by File::Find as the "wanted" callback: the file is identified by
    #     $File::Find::name, and $_ holds the name to open (File::Find has
    #     changed into the file's directory by then);
    #   * directly, with the path of the file as the only argument.
    #
    # Reads the globals $words (alternation of all keywords), @words and
    # $verbose; increments $count{$file}{$word} once for every line of the
    # file that matches $word. Binary and unreadable files are skipped.
    my ($file, $path);
    if (defined $File::Find::name) {
        # Report under the full name, but open the name that is valid in
        # the directory File::Find is currently in. Opening the full name
        # here fails for any relative start directory.
        $file = $File::Find::name;
        $path = $_;
    }
    else {
        $file = $path = shift;
    }
    return unless defined $path;
    return if -B $path;

    $verbose && print "Searching $file\n";

    my $fh;
    unless (open($fh, '<', $path)) {
        # Note the problem and move on rather than reading a dead handle.
        $verbose && print "Could not open $file: $!\n";
        $logfile && print LOG "Could not open $file: $!\n";
        return;
    }

    while (my $line = <$fh>) {
        # Cheap test first: does the line contain any keyword at all?
        next unless $line =~ /$words/;
        $verbose && print " $.:\t$line";
        for my $word (@words) {
            $count{$file}{$word}++ if $line =~ /$word/;
        }
    }
    close $fh;
    return;
} # End sub search
sub searchpage {
    # Fetch one web page, count keyword hits in it, and follow its links.
    #
    #   $cur_depth - how many links away from the start URL this page is
    #   $url       - absolute URL of the page to fetch
    #
    # Uses the globals $depth (maximum link depth, 0 when unset), $host
    # (when defined, a URL prefix every followed link must start with),
    # $p (the shared link-extracting parser), $words/@words (the keywords)
    # and $verbose. Marks every visited URL in %Docs and stores
    # case-insensitive hit counts in $count{$url}{$word}.
    # Returns 0 when the page was not processed, 1 otherwise.
    my ($cur_depth, $url) = @_;
    my $max_depth = defined $depth ? $depth : 0;

    $verbose && print "Looking at $url, at depth $cur_depth\n";
    $Docs{$url} = 1;
    return(0) if $cur_depth > $max_depth;

    my $content = get($url);
    unless (defined $content) {
        # get() yields undef when the page could not be retrieved; there is
        # nothing to search or parse in that case.
        $verbose && print "Could not fetch $url\n";
        $logfile && print LOG "Could not fetch $url\n";
        return(0);
    }

    # Cheap test first, then count each keyword on its own. The (?:...)
    # wrapper keeps an empty or alternation-style keyword well-formed.
    if ($content =~ m/$words/is) {
        for my $word (@words) {
            my $hits = 0;
            $hits++ while $content =~ /(?:$word)/gis;
            $count{$url}{$word} = $hits if $hits;
        }
    }

    # eof flushes anything still buffered and resets the parser, so a page
    # that ends in mid-tag cannot bleed into the next page parsed.
    $p->parse($content);
    $p->eof;

    for my $link ($p->links) {
        # Each entry is [tag, attribute, url, ...]; only <a href> is followed.
        next unless $link->[0] eq 'a' && $link->[1] eq 'href';

        my $abs = url($link->[2], $url)->abs->as_string;
        $abs =~ s/#.*$//;     # drop the fragment
        $abs =~ s!/$!!;       # and one trailing slash

        # Skip some URLs
        next unless $abs;
        next if $abs =~ /^mailto/i;
        next if $abs =~ /(gz|zip|exe|tar|Z)$/;
        # The host prefix is compared literally; its dots are not wildcards.
        next if defined $host && $abs !~ /^\Q$host\E/;
        next if $abs =~ /\?\S+?=\S+/;    # query strings
        next if $Docs{$abs} || $cur_depth + 1 > $max_depth;

        searchpage($cur_depth + 1, $abs);
    }
    return 1;
} # End sub searchpage
sub Usage {
    # Print the command-line synopsis to STDOUT and end the program with
    # exit status 0. Takes no arguments and never returns.
    #
    # The synopsis lists only switches the option parser accepts: the
    # former -k and -i entries were never implemented, and -H was missing.
    print <<EndUsage;
Usage: keywordsearch [-rfv] [-e suffix] [-l log] [-o outfile] -d dir keywords
keywordsearch [-fvH] [-l log] [-o outfile] [-D depth] URL keywords
Options:
-d Directory to search
-e File extension to search (i.e. .txt .html)
Default behavior is to search only text-type files
-f Output the frequency of the words
-D Depth of search, or how many levels down to go
(Web search) (Default is 0)
-r Recurse subdirectories (File search)
-l Log file for problems found
-o Output file (Default is STDOUT)
-v Verbose
-H Restrict to the initial host (Web search)
EndUsage
    exit(0);
}
=head1 NAME
keywordsearch - Searches a web site, or a local directory, for keyword(s)
=head1 DESCRIPTION
Simple command line tool for searching either a web site, or a local directory,
for documents that contain a particular word.
=head1 PREREQUISITES
Uses C<Getopt::Long>, C<LWP::Simple>, C<HTML::LinkExtor> and C<URI::URL>
=head1 COREQUISITES
None
=head1 README
Simple command-line tool for searching either a web site, or a local
directory, for documents containing particular keyword(s).
=pod OSNAMES
Any
=pod SCRIPT CATEGORIES
Search
=head1 Author
Written by Rich Bowen <[email protected]> for The Creative Group
(<http://www.cre8tivegroup.com>)
=cut