#!/usr/local/bin/perl



# DESCRIPTION:
#   The popular log analysis tool Analog includes a link to the HTML
#   validation tool at http://www.webtechs.com/html-val-svc/ in its output
#   files. The domain webtechs.com has apparently been taken over (legally
#   or not) by a porn site. Until that is resolved, there are a lot of pages
#   on the web that are unintentionally linking to a porn site. If you make
#   your stat pages available to the public, this obviously could cause you
#   much embarrassment.
#
#   This script simply opens the HTML files you specify and removes the link
#   to WebTechs. It can process all files in a given directory and optionally
#   any sub-directories (see the OPTIONS section below).
#
# USAGE:
#   nowebtechs.pl item1 [item2] [item3] [etc]
#
#   List the filename(s) and/or directory name(s) on the command line that
#   you want the script to process. Separate multiple files or directories
#   with a space. The wild-card character for matching multiple files is NOT
#   supported. You must list each file separately or list a directory name
#   to process every file in that directory.
#
#   This script must be run from the shell...not via the web as a CGI script!
#   If you are not able to run it, make sure the path to perl on the first
#   line is correct (type "which perl" at the command line). Also, make sure
#   it is executable (type "chmod 700 nowebtechs.pl" at the command line).
#
#   NOTE: This script can be customized for your needs. See the OPTIONS
#   section below.
#
# DISCLAIMER:
#    There are no guarantees or warranties with this script. It works
#    great for me, but your mileage may vary. USE IT AT YOUR OWN RISK.
#
# AUTHOR:
#    Sam Choukri
#    [email protected]
#
#    Version: 1.0
#    The latest version of this script can be found at:
#    http://www.bagism.com/freeware/nowebtechs.txt
#
#    Copyright (c) 1999 Sam Choukri


# OPTIONS:
$keep_backups = 1;
  # For safety, this script makes backups of your HTML files before
  # processing them. If you want to keep the backups, enter "1" for
  # this value. If you want the backup files to be automatically deleted
  # after each file has been successfully processed, enter "0" (zero).

$process_sub_dirs = 1;
  # If you want the script to process the files in any sub-directories
  # within the starting directory you specify, enter "1" for this value.
  # If you want it to process only the starting directory, enter "0" (zero).

$display_messages = 1;
  # If you want the script to display messages on the screen telling
  # you what files it is working on, whether or not the WebTechs link was
  # found, and any errors which occur, enter "1" for this value. If you
  # don't want to display any messages (helpful if you are running this
  # script as a cron job), enter "0" (zero).

@permitted_file_extensions = qw(html htm shtml);
  # This is the list of file extensions that the script is allowed to
  # process. The case of the extension does NOT matter.
  # Do NOT add the "." to the extension...that will be added implicitly.


############################################################################

use File::Copy;

$ext = join ('|', @permitted_file_extensions);

foreach (@ARGV) {
  unless (-e $_) {
     &Message("**Skipped $_: non-existant file or directory\n");
  }
  &Shunt($_);
}

sub Shunt {
  local($item) = @_;

  next if (-l $item); # skip symbolic links
  if (-d $item) {
     #VERY IMPORTANT: prevents re-reading current and prior directories
     next if $item =~ /^\./;
     &OpenDir($item);
  } elsif (m/\.($ext)$/oi) {
     &OpenFile($item);
  }
}


sub OpenDir {
  local($dir) = @_;
  local(@contents) = ();

  &Message("Opening directory $dir...\n");

  opendir (DIR, "$dir") or &Message("**Can't open the directory $dir: $!\n");
  @contents = grep(!/^\./, readdir (DIR));
  closedir DIR;

  $dir =~ s|/*$||;
  foreach (@contents) {
     s|^(.*)$|$dir/$1|;
     if (-d $_ && !$process_sub_dirs) {
        &Message("--Skipping sub-directory $dir\n");
        next;
     }
     &Shunt($_);
  }
}

sub OpenFile {
  local($file) = @_;
  local($text) = '';

  &Message("--Processing FILE $file...\n");

  if ( copy($file,"$file.bak") ) {

     $found_link = 0;
     open (INFILE, "$file.bak") or &Message("**Can't read file $file.bak: $!\n");
     open (OUTFILE, ">$file") or &Message("**Can't write to file $file: $!\n");
     while (<INFILE>) {
        # If we find the beginning of the webtechs link, let's
        # skip the rest of the lines and just print the closing
        # body and html tags instead.
        if (m|^<P> <A HREF="http://www.webtechs.com/html-val-svc/">|i) {
           print OUTFILE "<!--removed WebTechs link-->\n";
           print OUTFILE "</body>\n</html>\n";
           &Message("----Removed Webtechs link in $file\n");
           $found_link = 1;
           last;
        } else {
           print OUTFILE;
        }
     }
     $found_link or &Message("----WebTechs link not found in $file\n");
     close (INFILE) or &Message("**Can't close file $file.bak: $!\n");
     close (OUTFILE) or &Message("Can't close file $file: $!\n");

     unless ($keep_backups) {
        unlink "$file.bak" or &Message("**Can't delete backup file $file.bak: $!\n");
     }

  } else {
     &Message("**Can't create a backup file for $file (skipped): $!\n");
  }
}

sub Message {
  local($message) = @_;
  print $message if $display_messages;
}