# DESCRIPTION:

#!/usr/local/bin/perl

# DESCRIPTION:
# The popular log analysis tool Analog includes a link to the HTML
# validation tool at http://www.webtechs.com/html-val-svc/ in its output
# files. The domain webtechs.com has apparently been taken over (legally
# or not) by a porn site. Until that is resolved, there are a lot of pages
# on the web that are unintentionally linking to a porn site. If you make
# your stat pages available to the public, this obviously could cause you
# much embarrassment.
#
# This script simply opens the HTML files you specify and removes the link
# to WebTechs. It can process all files in a given directory and optionally
# any sub-directories (see the OPTIONS section below).
#
# USAGE:
# nowebtechs.pl item1 [item2] [item3] [etc]
#
# List the filename(s) and/or directory name(s) on the command line that
# you want the script to process. Separate multiple files or directories
# with a space. The wild-card character for matching multiple files is NOT
# supported. You must list each file separately or list a directory name
# to process every file in that directory.
#
# This script must be run from the shell...not via the web as a CGI script!
# If you are not able to run it, make sure the path to perl on the first
# line is correct (type "which perl" at the command line). Also, make sure
# it is executable (type "chmod 700 nowebtechs.pl" at the command line).
#
# NOTE: This script can be customized for your needs. See the OPTIONS
# section below.
#
# DISCLAIMER:
# There are no guarantees or warranties with this script. It works
# great for me, but your mileage may vary. USE IT AT YOUR OWN RISK.
#
# AUTHOR:
# Sam Choukri
# [email protected]
#
# Version: 1.0
# The latest version of this script can be found at:
# http://www.bagism.com/freeware/nowebtechs.txt
#
# Copyright (c) 1999 Sam Choukri

# OPTIONS:
$keep_backups = 1;
# For safety, this script makes backups of your HTML files before
# processing them. If you want to keep the backups, enter "1" for
# this value. If you want the backup files to be automatically deleted
# after each file has been successfully processed, enter "0" (zero).

$process_sub_dirs = 1;
# If you want the script to process the files in any sub-directories
# within the starting directory you specify, enter "1" for this value.
# If you want it to process only the starting directory, enter "0" (zero).

$display_messages = 1;
# If you want the script to display messages on the screen telling
# you what files it is working on, whether or not the WebTechs link was
# found, and any errors which occur, enter "1" for this value. If you
# don't want to display any messages (helpful if you are running this
# script as a cron job), enter "0" (zero).

@permitted_file_extensions = qw(html htm shtml);
# This is the list of file extensions that the script is allowed to
# process. The case of the extension does NOT matter.
# Do NOT add the "." to the extension...that will be added implicitly.

############################################################################

use File::Copy;

$ext = join ('|', @permitted_file_extensions);

foreach (@ARGV) {
unless (-e $_) {
&Message("**Skipped $_: non-existant file or directory\n");
}
&Shunt($_);
}

sub Shunt {
local($item) = @_;

next if (-l $item); # skip symbolic links
if (-d $item) {
#VERY IMPORTANT: prevents re-reading current and prior directories
next if $item =~ /^\./;
&OpenDir($item);
} elsif (m/\.($ext)$/oi) {
&OpenFile($item);
}
}

sub OpenDir {
local($dir) = @_;
local(@contents) = ();

&Message("Opening directory $dir...\n");

opendir (DIR, "$dir") or &Message("**Can't open the directory $dir: $!\n");
@contents = grep(!/^\./, readdir (DIR));
closedir DIR;

$dir =~ s|/*$||;
foreach (@contents) {
s|^(.*)$|$dir/$1|;
if (-d $_ && !$process_sub_dirs) {
&Message("--Skipping sub-directory $dir\n");
next;
}
&Shunt($_);
}
}

sub OpenFile {
local($file) = @_;
local($text) = '';

&Message("--Processing FILE $file...\n");

if ( copy($file,"$file.bak") ) {

$found_link = 0;
open (INFILE, "$file.bak") or &Message("**Can't read file $file.bak: $!\n");
open (OUTFILE, ">$file") or &Message("**Can't write to file $file: $!\n");
while (<INFILE>) {
# If we find the beginning of the webtechs link, let's
# skip the rest of the lines and just print the closing
# body and html tags instead.
if (m|^<P> <A HREF="http://www.webtechs.com/html-val-svc/">|i) {
print OUTFILE "\n";
print OUTFILE "</body>\n</html>\n";
&Message("----Removed Webtechs link in $file\n");
$found_link = 1;
last;
} else {
print OUTFILE;
}
}
$found_link or &Message("----WebTechs link not found in $file\n");
close (INFILE) or &Message("**Can't close file $file.bak: $!\n");
close (OUTFILE) or &Message("Can't close file $file: $!\n");

unless ($keep_backups) {
unlink "$file.bak" or &Message("**Can't delete backup file $file.bak: $!\n");
}

} else {
&Message("**Can't create a backup file for $file (skipped): $!\n");
}
}

sub Message {
local($message) = @_;
print $message if $display_messages;
}