#!/usr/local/bin/perl
# go4check, v1.3
#
#-------------------------------------------------------------------------------
# Introduction
# go4check checks gopher links, probing each connection and testing the
# output received. It handles most types of links, reporting if the link
# is ok, the host serving it is down/refusing connections, or its pathname
# has changed. It is not 100% successful at this, especially when it
# comes to gopher0 servers, but does indeed help you keep on top of links
# in your server(s).
#
# To run, go4check requires only perl and socket.ph. It understands
# gopher0 and gopher+ servers.
#
# go4check produces a line of output on stdout for each item appearing
# in a gopher's menu: the name of the item plus a result. Indentation
# serves to maintain items in context so problems can be located easily.
# As an extra benefit, go4check's output can be used as a roadmap of
# the gopher after some rather trivial editing to remove results.
#
# go4check is written by George A. Theall,
[email protected].
# You may freely use and redistribute this. I can not offer any
# support for this but am interested in your comments, suggestions,
# and problem reports.
#
# The latest version is available via gopher as:
#
gopher://tjgopher.tju.edu/00/networks/internet/tools/gopher/go4check
#
# Note: Version 1.3 will probably be the last version of go4check I release.
#
#-------------------------------------------------------------------------------
# Operation
# Before you run go4check, make sure perl and the header file socket.ph are
# available on your system. [You can generate this file by running the perl
# utility h2ph on /usr/include/sys/socket.h, or something similar.]
#
# Invoke go4check with the name of the server to check and an optional port
# number. Other options can be used to specify a non-standard starting
# path or generate copious debugging info. go4check will test the items
# listed in the initial menu and recurse into any menus it finds as long
# as the names of server it finds match the one specified at go4check's
# invocation. go4check does, though, skip recursion if pathnames refer
# to ftp gateways or point back to the initial entry point.
#
# Results are directed to stdout, so you probably will want to redirect
# to a file. You might then remove instances of "...ok.", which
# indicate no problems and finally search on "...can't connect.",
# "...path changed.", and "...timed out.". Another possible result
# is "...n/a.", which is used when go4check doesn't know how to check
# a particular type of link.
#
# You may want to tune the variables that go4check uses for testing
# items of type 2 and 7. See below where initial values are defined.
# For items of type 2, go4check sends a invalid command, which causes
# many CSO servers to respond in a way that go4check interprets as a
# success. As for items of type 7, I don't know of any robust way
# to test searches. Currently, the best solution appears to be
# to search for a word that's common to whatever searches are in the
# gopher being checked.
#
# go4check is slow; it probably belongs in a cron job to run at night.
#
#-------------------------------------------------------------------------------
# History
# 15-Mar-95, GAT, v1.3
# - Added ability to avoid recursing into selected paths. Paths are
# tested using substr() so you can have go4check check a directory
# but not recurse further by appending a "/", if that's what you want.
#
# 27-Feb-95, GAT, v1.2
# - Moved alarm for connecting to within the tcpconnect subroutine
# to handle better time-out conditions.
# - Wrapped initial gopher connection and telnet check with alarms.
#
# 31-Jan-95, GAT, v1.1
# - Alarms are now used to abort connections that are otherwise hung.
# - Added patches from R.D. Cameron for supporting type 7 items with
# non-empty paths and checking error returns of type 3.
# - Fixed glitch that arose on some servers (gopher.uwsp.edu for one)
# that return lines with non-standard endings.
# - Explicitly added an assignment for $| and set it to true so output
# will be flushed after every print.
#
# 17-Oct-94, GAT
# - Added a semicolon after a line in make_URL. Its lack appears to
# cause problems with some versions of Perl.
#
# 01-Sep-94, GAT, v1.0
# - Released publically.
#
# 10-Aug-94, GAT, v1.0b2
# - Added $snooze_length as a way to control how long to pause after
# establishing a connection.
# - Fixed initialization of %URLs.
# - Changed format of internal URLs by removing ":" from between type
# and path info.
# - Used a configurable word to check search items.
# - Added check of CSO servers.
# - Adjusted regular expression used to check success/failure of
# a link.
# - Documented go4check's operation.
#
# 12-Jul-94, GAT, v1.0b1
# - Used pseudo URLs internally for storing links so they are not
# checked more than once.
# - Added support for most types of links, including telnet, binary
# files, and searches.
# - Used gopher+ protocol whenever possible to avoid retrieving
# entire files.
#
# 09-Jun-94, GAT, v1.0a
# - First version of go4check. Checks only files and directories.
#
#-------------------------------------------------------------------------------
# Specify where perl can find include files.
push(@INC, "/usr/local/lib/perl");
# Define initial values for selected variables.
$| = 1; # flush after every print?
$default_path2 = "helo"; # for searching type 2 items
$default_search_term = "cancer"; # for searching type 7 items
@excluded_paths = ( # paths to exclude
"1/tjgopher/changes",
"1/tju/atrium/",
"1/tju/nutrition/atrium",
"1/tju/jeffnews/current",
"1/tju/jeffnews/backissues/",
"1/tju/marketing/jeffnews",
"1/gophers/bylocation/tju",
"1/gophers/bylocation/philly");
$Indent = " "; # indentation at each level
$snooze_length = 3; # time to snooze before connect
$timeout = 180; # max len of connect (seconds)
%URLs = (); # array of URL's on server
# Check for options.
$DEBUG = 0; # default to no debug
if ($ARGV[0] eq '-d') {
shift;
$DEBUG = 1;
}
# Parse commandline args and provide help as needed.
$inithost = shift || ""; # name of host to check
$initport = shift || 70; # port number
$initpath = shift || ""; # initial directory
if ($inithost eq "" || $inithost eq "-?") {
print "$0 checks links in a gopher by probing connections\n\n";
print "Usage: $0 [-d] host [port] [\"path\"]\n";
print " unless specified, port defaults to 70 and path to \"\".\n";
print " -d is used for debugging.\n";
exit(9);
}
# Set up subroutines to catch some alarms.
$SIG{'ALRM'} = handle_Timeout;
# Establish connection and check links.
require 'sys/socket.ph';
chop($thishost = `hostname`); # needed for tcpconnect
&check_Links($inithost, $initport, $initpath);
exit(0);
########################################################################
# check_Links - checks links for a given directory. #
# #
# Notes: #
# - Links on the same host will be followed unless they point to #
# the root. While this will prevent most recursion, there may #
# be some gophers with odd setups that lead to infinite loops. #
# - FTP links are not followed. #
# Entry: #
# host = hostname #
# port = port number #
# path = selector string #
# Exit: #
# New links are appended to @URLs. #
########################################################################
sub check_Links {
local($host, $port, $path) = @_;
local($margin) = $Indent . $margin;
local($stat);
local(@Items);
# Establish connection and read contents.
$DEBUG && print "DEBUG: connecting to $host at port $port.\n";
($GOPHER) = &tcpconnect($host, $thishost);
if ($@ && $@ =~ /Timed Out/) {
die "$@";
}
($GOPHER) || die "Can't connect";
$DEBUG && print "DEBUG: sending path \"$path\".\n";
eval {
alarm($timeout);
send($GOPHER, "$path\r\n", 0);
@Items = <$GOPHER>;
close($GOPHER);
alarm(0);
};
if ($@ && $@ =~ /Timed Out/) {
die "$@";
}
# Check each item, recursing into directories as necessary.
foreach (@Items) {
local($atype, $aname, $apath, $ahost, $aport, $aextra);
s/\s*$//; # remove \r\n combo
last if (/^\.$/); # done if line is just a period
# Check status of each unique URL.
$url = &make_URL($_);
s/^(.)// && ($atype = $1);
($aname, $apath, $ahost, $aport, $aextra) = split(/\t/, $_);
chop($ahost) if ($ahost =~ /\.$/);
if (defined($URLs{$url})) { # already checked
print "$margin$aname...$URLs{$url}.\n";
}
else {
$stat = ($URLs{$url} = &test_URL($url, $aextra));
print "$margin$aname...$stat.\n";
}
# Recurse as necessary.
if ($stat eq "ok" &&
$atype == 1 &&
$ahost eq $inithost &&
$aport eq $initport &&
$apath ne "" &&
&is_Excluded($apath) == 0 &&
$apath !~ /ftp.*:/) {
&check_Links($ahost, $aport, $apath);
}
}
}
################################################
# make_URL - constructs a URL from a string. #
# #
# Notes: #
# - The URLs generated here are not 100% #
# kosher, only used internally. #
# #
# Entry: #
# string as passed by gopher server. #
# Exit: #
# string representing URL. #
################################################
sub make_URL {
local($_) = @_;
local($url);
local($type, $name, $path, $host, $port);
s/^(.)// && ($type = $1);
($name, $path, $host, $port) = split(/\t/, $_);
chop($host) if ($host =~ /\.$/);
if ($type =~ /[01245679sgMhIi]/) {
$url = "gopher://$host:$port/$type$path";
}
elsif ($type =~ /[8T]/) {
$url = "telnet://";
$path !~ /^$/ && $url .= "$path@";
$url .= $host;
$port > 0 && $url .= ":$port";
$url .= "/";
}
return($url);
}
###########################################################################
# test_URL - check that a URL is accessible. #
# #
# Notes: #
# - I don't have a good way to check gopher0 servers. Currently, I #
# look for the string "error.host", which servers like gn seem to #
# generate. However, this fails with KA9Q, for which an error #
# message is indistinguishable from regular text. #
# - For gopher+, a error code indicating a server is too busy is #
# treated as an error. This may not be the right thing to do. #
# - If the server understands gopher+, we'll only ask for info (!) #
# so as not to retrieve large files. This approach also seems to #
# be the only way to check ASK blocks reliably. #
# - CSO nameservers (type 2) are checked with an invalid command - #
# this returns a warning message from the server that is not #
# regarded as an error by go4check. Using the command "fields" #
# does *not* work since this typically results in lines starting #
# with -2, which look like errors. #
# - Checks of telnet links only see if host is up; no attempt #
# is made to login to whatever account may be specified. #
# - Checks of FTP links could be improved. Currently, the info #
# returned is not examined beyond looking for the usual signs #
# of failure. #
# Entry: #
# URL = URL to test #
# GPLUS = extra character indicating a gopher+ item. #
# Exit: #
# Text string indicating status of URL: #
# "ok" = everything ok #
# "can't connect" = can't connect to host #
# "path changed" = path changed #
# "n/a" = unknown status #
###########################################################################
sub test_URL {
local($_, $gplus) = @_;
local($protocol, $logonid, $host, $port, $type, $path);
local($1, $2, $3, $4, $5);
$DEBUG && print "DEBUG: checking $_.\n";
m#^(\w+)://(.*):(\d+)/?(.?)(.*)#;
$protocol = $1;
$host = $2;
$port = $3;
$type = $4;
$path = $5;
if ($host =~ /@/) {
($logonid, $host) = split(/@/, $host);
}
$DEBUG && print "protocol=$protocol; logonid=$logonid; host=$host; port=$port; type=$type; path=$path.\n";
# Check gopher links.
if ($protocol eq "gopher") {
local($GOPHER);
local($Stuff);
$DEBUG && print "DEBUG: checking gopher at $host;$port.\n";
($GOPHER) = &tcpconnect($host, $thishost);
if ($@ && $@ =~ /Timed Out/) {
return "timed out";
}
($GOPHER) || return "can't connect";
$path .= "\t!" if ($gplus); # Modify selector to get only info
if ($type eq "2") {
$path = $default_path2 if ($path =~ /^$/);
}
elsif ($type eq "7") {
# Modification Oct. 19/94 by R.D. Cameron to append
# handle the nonempty $path case: to test in this
# case, we send a tab and the search term after the
# $path.
if ($path =~ /^$/) {
$path = $default_search_term;
}
else {
$path = "$path\t$default_search_term";
}
$path =~ s#^waissrc:(.*)/.*$#1$1#;
}
$DEBUG && print "DEBUG: sending path \"$path\".\n";
eval {
alarm($timeout);
send($GOPHER, "$path\r\n", 0);
$Stuff = <$GOPHER>;
close($GOPHER);
alarm(0);
};
if ($@ && $@ =~ /Timed Out/) {
return "timed out";
}
$DEBUG && print "DEBUG: read \"$Stuff\".\n";
# Test line for signs of errors.
#
# Modification Oct. 19/94 by R.D. Cameron to
# check for type 3 error returns when a directory
# listing is expected. (According to the gopher
# protocol, "3" as the first character of a directory
# entry always indicates error.
if ((($type eq "1") | ($type eq "7")) &
($Stuff =~ /^3/)) {
return("path changed");
}
# Test line for other signs of errors.
elsif ($Stuff =~ /(^\-\-\d)|(\terror.host\t\d+)/) {
return("path changed");
}
else {
return("ok");
}
}
# Check telnet links.
if ($protocol eq "telnet") {
local($TELNET);
$DEBUG && print "DEBUG: checking telnet at $host;$port.\n";
($TELNET) = &tcpconnect($host, $thishost);
if ($@ && $@ =~ /Timed Out/) {
return "timed out";
}
($TELNET) || return "host down";
return "ok";
close($TELNET);
}
# If we get here, we don't know how to test the link.
return("n/a");
}
#######################################################
# is_Excluded - checks if a path is to be excluded. #
# #
# Entry: #
# path to be tested. #
# Exit: #
# 0/1 indicating no/yes. #
#######################################################
sub is_Excluded {
local($path) = @_;
for (@excluded_paths) {
if (index($path, $_) >= $[) {
return(1);
}
}
return(0);
}
################################################################
# This comes from gopherhunt by Paul Lindner. #
# #
# I've added a line to abort if it can't resolve an address. #
# and return 0 if failure rather than die. GAT #
# #
# I also added an alarm to handle time-out conditions. GAT #
################################################################
sub tcpconnect { #Get TCP info in place
local($host, $hostname) = @_;
local($name, $aliases, $type, $len);
local($thisaddr, $thataddr, $this, $that);
local($sockaddr);
$sockaddr = 'S n a4 x8';
($name,$aliases,$proto) = getprotobyname('tcp');
($name,$aliases,$port) = getservbyname($port, 'tcp')
unless $port =~ /^\d+$/;
($name,$aliases,$type,$len,$thisaddr) = gethostbyname($hostname);
($name,$aliases,$type,$len,$thataddr) = gethostbyname($host);
$name || return(0);
$this = pack($sockaddr, &AF_INET, 0, $thisaddr);
$that = pack($sockaddr, &AF_INET, $port, $thataddr);
sleep($snooze_length);
eval {
alarm($timeout);
socket(N, &PF_INET, &SOCK_STREAM, $proto) || return(0);
bind(N, $this) || return(0);
connect(N, $that) || return(0);
alarm(0);
};
return(N);
}
#####################################################
# handle_Timeout - Die with a specific message. #
# #
# Notes: #
# - Calls to alarm() should be in an eval #
# block. #
# #
# Entry: #
# n/a #
# Exit: #
# Message "Timed Out" is returned. #
#####################################################
sub handle_Timeout {
$DEBUG && print "DEBUG: Timed Out.\n";
die "Timed Out";
}