#!/usr/bin/perl
#
http://www.cs.dal.ca/~vlado/srcperl/report-new.pl
# Copyright 2000-7 Vlado Keselj www.cs.dal.ca/~vlado
sub help { print STDERR <<"#EOT" }
# Report new material on a web page, version $VERSION
#
# The program normally runs as a cron job, so the result is sent as an
# e-mail if it something changes. It is convenient for keeping a
# watch on interesting web pages. It uses diff, lynx or wget,
# sendmail (if option -e is used)
#
# Usage: report-new.pl [switches] URL
# -h Print help and exit.
# -v Print version of the program and exit.
# -e email Sends output, if not empty, to email.
# -D dir Working directory. By default, ${HOME}/agn is used if
# exists, otherwise the current directory.
# -d method Dumping method. The default is lynx_txt; the other
# option is wget.
# -g Debugging mode. The new file is not fetched, but the program
# reruns on the last two different versions of the page.
# -p procf Input processor. There is supposed to be a file named
# procf that can be 'require'-ed, in Perl sense, which defines
# a subroutine named procf, which is used to filter the file
# before making diff. The escape string '!KEEP: ' can be left
# at the beginning of the lines which we want to appear in diff
# whenever there is a difference.
# More documentation included close to the end of the source file.
#EOT
use strict vars;
use POSIX qw(strftime);
use Carp;
use vars qw( $VERSION );
$VERSION = sprintf "%d.%d", q$Revision: 1.16 $ =~ /(\d+)/g;
use Getopt::Std;
use vars qw($opt_v $opt_h $opt_e $opt_d $opt_D $opt_p $opt_g);
getopts("hve:D:d:p:g");
if ($opt_v) { print "$VERSION\n"; exit; }
elsif ($opt_h || !@ARGV) { &help(); exit; }
my $dump = \&dump_lynx_txt;
if ($opt_d ne '') {
if ($opt_d eq 'lynx_txt') { $dump = \&dump_lynx_txt }
elsif($opt_d eq 'wget' ) { $dump = \&dump_wget }
else { &help(); print STDERR "error: '-d $opt_d'"; exit -1; }
}
if ($opt_D eq '') {
if (-d $ENV{HOME}."/agn") { $opt_D = $ENV{HOME}."/agn" }
else { $opt_D = '.' }
} elsif (! -d $opt_D ) { die "directory \"$opt_D\" does not exist" }
($#ARGV==0 && $ARGV[0]=~/^http:\/\//) ||
die "Format: report-new.pl http://...\n";
my ($urlbase, $url);
$urlbase = $url = shift; # E.g.:
http://www.cs.dal.ca/~vlado/srcperl
if ( $url =~ m.//[^/]*/. )
{ $urlbase = $`.$& } # E.g.:
http://www.cs.dal.ca/
my $urlId = &encode_w1($url);
my $timestamp = strftime("%Y-%m-%d-%T", localtime(time));
if (! -d "$opt_D/tmp")
{ mkdir "$opt_D/tmp", 0700 or die "can't mkdir $opt_D/tmp: $!" }
if (! -d "$opt_D/report-new.pl.d")
{ mkdir "$opt_D/report-new.pl.d", 0700 or die "can't mkdir $opt_D/report-new.pl.d: $!" }
chdir $opt_D;
my $TmpBase = "$opt_D/tmp/$urlId-$timestamp";
my $TmpFile1 = "$TmpBase-1";
my $TmpFile2 = "$TmpBase-2";
my $lastFile = "$opt_D/report-new.pl.d/$urlId.last";
-e $lastFile or putfile($lastFile,'');
my $lastFile1 = "$opt_D/report-new.pl.d/$urlId.last-1";
-e $lastFile1 or putfile($lastFile1,'');
# First step: fetch the page, unless option -g is given
if (! $opt_g ) {
&$dump($url, $TmpFile1);
my $f1 = getfile($lastFile);
my $f2 = getfile($TmpFile1);
if ($f1 eq $f2) { rename($TmpFile1, $lastFile); &_exit(0); } # nothing changed, exit
rename($lastFile, $lastFile1);
rename($TmpFile1, $lastFile);
}
my $material = getfile($lastFile);
my $material1 = getfile($lastFile1);
if ($opt_p) {
require $opt_p;
$material = &$opt_p($material);
$material1 = &$opt_p($material1);
}
putfile($TmpFile1, $material);
putfile($TmpFile2, $material1);
my $diffres = `diff $TmpFile1 $TmpFile2 2>&1`;
$diffres =~ s/^[^<].*\n//mg;
$diffres =~ s/^< //mg;
if ($diffres ne '') {
if ($opt_p && index($material, '!KEEP: ') > -1) {
$material =~ s/^!KEEP: //mg;
putfile($TmpFile1, $material);
$diffres = `diff $TmpFile1 $TmpFile2 2>&1`;
$diffres =~ s/^[^<].*\n//mg;
$diffres =~ s/^< //mg;
}
if ($opt_e) {
my $out;
open($out, "|sendmail -t") or die;
print $out "To: $opt_e\n".
"Subject: [report-new.pl] $url\n\n$diffres";
close($out);
}
else { print $diffres }
}
&_exit(0);
sub _exit {
my $r = shift;
unlink $TmpFile1 if -e $TmpFile1;
unlink $TmpFile2 if -e $TmpFile2;
exit $r;
}
sub putfile($@) {
my $f = shift;
local *F;
open(F, ">$f") or die "putfile:cannot open $f:$!";
print F '' unless @_;
while (@_) { print F shift(@_) }
close(F)
}
sub encode_w1( $ ) {
local $_ = shift;
s/[\W_]/'_'.uc unpack("H2",$&)/ge;
return $_;
}
sub dump_lynx_txt {
my $url = shift;
my $file = shift;
local *F;
open(F,"|lynx -dump -nolist - > \Q$file") or die "lynx error:$!";
print F $url;
close(F);
}
sub dump_wget {
my $url = shift;
my $file = shift;
system('wget', '--quiet', '-O', $file, $url);
}
sub getfile($) {
my $f = shift;
local *F;
open(F, "<$f") or croak "getfile:cannot open $f:$!";
my @r = <F>;
close(F);
return wantarray ? @r : join ('', @r);
}
__END__
=head1 NAME
report-new.pl - Report new material on a web page
=head1 SYNOPIS
report-new.pl [switches] URL
=head1 DESCRIPTION
Reports new material on a web page. Typically used as a cron job.
-h Print help and exit.
-v Print version of the program and exit.
-e email Sends output, if not empty, to email.
-D dir Working directory. By default, ${HOME}/agn is used if
exists, otherwise the current directory.
-d method Dumping method. The default is lynx_txt; the other
option is wget.
-p procf Input processor. There is supposed to be a file named
procf that can be 'require'-ed, which defines a subroutine
named procf, which is used to filter the file before making
diff. The escape string '!KEEP: ' can be left at the
beginning of the lines which we want to appear in diff
whenever there is a difference.
=head1 PREREQUISITES
POSIX qw(strftime);
uses diff, lynx or wget, sendmail (if option -e is used).
=head1 SCRIPT CATEGORIES
Web
=head1 README
Reports new material on a web page.
=head1 SEE ALSO
Scripts:
wget
=head1 THANKS
I would like to thank Peet Moris for bug reports and comments.
=head1 COPYRIGHT
Copyright 2000-7 Vlado Keselj F<
http://www.cs.dal.ca/~vlado>
This script is provided "as is" without expressed or implied warranty.
This is free software; you can redistribute it and/or modify it under
the same terms as Perl itself.
The latest version can be found at F<
http://www.cs.dal.ca/~vlado/srcperl/>.
=cut