#!/usr/local/bin/perl
# go4check, v1.3
#
#-------------------------------------------------------------------------------
# Introduction
#   go4check checks gopher links, probing each connection and testing the
#   output received.  It handles most types of links, reporting if the link
#   is ok, the host serving it is down/refusing connections, or its pathname
#   has changed.  It is not 100% successful at this, especially when it
#   comes to gopher0 servers, but does indeed help you keep on top of links
#   in your server(s).
#
#   To run, go4check requires only perl and socket.ph.  It understands
#   gopher0 and gopher+ servers.
#
#   go4check produces a line of output on stdout for each item appearing
#   in a gopher's menu: the name of the item plus a result.  Indentation
#   serves to maintain items in context so problems can be located easily.
#   As an extra benefit, go4check's output can be used as a roadmap of
#   the gopher after some rather trivial editing to remove results.
#
#   go4check is written by George A. Theall, [email protected].
#   You may freely use and redistribute this.  I can not offer any
#   support for this but am interested in your comments, suggestions,
#   and problem reports.
#
#   The latest version is available via gopher as:
#      gopher://tjgopher.tju.edu/00/networks/internet/tools/gopher/go4check
#
#   Note: Version 1.3 will probably be the last version of go4check I release.
#
#-------------------------------------------------------------------------------
# Operation
#   Before you run go4check, make sure perl and the header file socket.ph are
#   available on your system. [You can generate this file by running the perl
#   utility h2ph on /usr/include/sys/socket.h, or something similar.]
#
#   Invoke go4check with the name of the server to check and an optional port
#   number.  Other options can be used to specify a non-standard starting
#   path or generate copious debugging info.  go4check will test the items
#   listed in the initial menu and recurse into any menus it finds as long
#   as the names of server it finds match the one specified at go4check's
#   invocation. go4check does, though, skip recursion if pathnames refer
#   to ftp gateways or point back to the initial entry point.
#
#   Results are directed to stdout, so you probably will want to redirect
#   to a file.  You might then remove instances of "...ok.", which
#   indicate no problems and finally search on "...can't connect.",
#   "...path changed.", and "...timed out.".  Another possible result
#   is "...n/a.", which is used when go4check doesn't know how to check
#   a particular type of link.
#
#   You may want to tune the variables that go4check uses for testing
#   items of type 2 and 7.  See below where initial values are defined.
#   For items of type 2, go4check sends a invalid command, which causes
#   many CSO servers to respond in a way that go4check interprets as a
#   success.  As for items of type 7, I don't know of any robust way
#   to test searches.  Currently, the best solution appears to be
#   to search for a word that's common to whatever searches are in the
#   gopher being checked.
#
#   go4check is slow; it probably belongs in a cron job to run at night.
#
#-------------------------------------------------------------------------------
# History
#   15-Mar-95, GAT, v1.3
#       - Added ability to avoid recursing into selected paths.  Paths are
#         tested using substr() so you can have go4check check a directory
#         but not recurse further by appending a "/", if that's what you want.
#
#   27-Feb-95, GAT, v1.2
#      - Moved alarm for connecting to within the tcpconnect subroutine
#        to handle better time-out conditions.
#      - Wrapped initial gopher connection and telnet check with alarms.
#
#   31-Jan-95, GAT, v1.1
#      - Alarms are now used to abort connections that are otherwise hung.
#      - Added patches from R.D. Cameron for supporting type 7 items with
#        non-empty paths and checking error returns of type 3.
#      - Fixed glitch that arose on some servers (gopher.uwsp.edu for one)
#        that return lines with non-standard endings.
#      - Explicitly added an assignment for $| and set it to true so output
#        will be flushed after every print.
#
#   17-Oct-94, GAT
#      - Added a semicolon after a line in make_URL.  Its lack appears to
#        cause problems with some versions of Perl.
#
#   01-Sep-94, GAT, v1.0
#      - Released publically.
#
#   10-Aug-94, GAT, v1.0b2
#      - Added $snooze_length as a way to control how long to pause after
#        establishing a connection.
#      - Fixed initialization of %URLs.
#      - Changed format of internal URLs by removing ":" from between type
#        and path info.
#      - Used a configurable word to check search items.
#      - Added check of CSO servers.
#      - Adjusted regular expression used to check success/failure of
#        a link.
#      - Documented go4check's operation.
#
#   12-Jul-94, GAT, v1.0b1
#      - Used pseudo URLs internally for storing links so they are not
#        checked more than once.
#      - Added support for most types of links, including telnet, binary
#        files, and searches.
#      - Used gopher+ protocol whenever possible to avoid retrieving
#        entire files.
#
#   09-Jun-94, GAT, v1.0a
#      - First version of go4check. Checks only files and directories.
#
#-------------------------------------------------------------------------------


# Specify where perl can find include files.
push(@INC, "/usr/local/lib/perl");


# Define initial values for selected variables.
$| = 1;                                 # flush after every print?
$default_path2 = "helo";                # for searching type 2 items
$default_search_term = "cancer";        # for searching type 7 items
@excluded_paths = (                     # paths to exclude
       "1/tjgopher/changes",
       "1/tju/atrium/",
       "1/tju/nutrition/atrium",
       "1/tju/jeffnews/current",
       "1/tju/jeffnews/backissues/",
       "1/tju/marketing/jeffnews",
       "1/gophers/bylocation/tju",
       "1/gophers/bylocation/philly");
$Indent = "  ";                         # indentation at each level
$snooze_length = 3;                     # time to snooze before connect
$timeout = 180;                         # max len of connect (seconds)
%URLs = ();                             # array of URL's on server


# Check for options.
$DEBUG = 0;                             # default to no debug
if ($ARGV[0] eq '-d') {
       shift;
       $DEBUG = 1;
}


# Parse commandline args and provide help as needed.
$inithost = shift || "";                # name of host to check
$initport = shift || 70;                # port number
$initpath = shift || "";                # initial directory
if ($inithost eq "" || $inithost eq "-?") {
       print "$0 checks links in a gopher by probing connections\n\n";
       print "Usage:  $0 [-d] host [port] [\"path\"]\n";
       print "        unless specified, port defaults to 70 and path to \"\".\n";
       print "        -d is used for debugging.\n";
       exit(9);
}


# Set up subroutines to catch some alarms.
$SIG{'ALRM'} = handle_Timeout;


# Establish connection and check links.
require 'sys/socket.ph';
chop($thishost = `hostname`);           # needed for tcpconnect
&check_Links($inithost, $initport, $initpath);
exit(0);


########################################################################
#  check_Links - checks links for a given directory.                   #
#                                                                      #
#  Notes:                                                              #
#      - Links on the same host will be followed unless they point to  #
#        the root.  While this will prevent most recursion, there may  #
#        be some gophers with odd setups that lead to infinite loops.  #
#      - FTP links are not followed.                                   #
#  Entry:                                                              #
#        host = hostname                                               #
#        port = port number                                            #
#        path = selector string                                        #
#  Exit:                                                               #
#        New links are appended to @URLs.                              #
########################################################################
sub check_Links {
       local($host, $port, $path) = @_;
       local($margin) = $Indent . $margin;
       local($stat);
       local(@Items);


       # Establish connection and read contents.
       $DEBUG && print "DEBUG: connecting to $host at port $port.\n";
       ($GOPHER) = &tcpconnect($host, $thishost);
       if ($@ && $@ =~ /Timed Out/) {
               die "$@";
       }
       ($GOPHER) || die "Can't connect";
       $DEBUG && print "DEBUG: sending path \"$path\".\n";
       eval {
               alarm($timeout);
               send($GOPHER, "$path\r\n", 0);
               @Items = <$GOPHER>;
               close($GOPHER);
               alarm(0);
       };
       if ($@ && $@ =~ /Timed Out/) {
               die "$@";
       }


       # Check each item, recursing into directories as necessary.
       foreach (@Items) {
               local($atype, $aname, $apath, $ahost, $aport, $aextra);

               s/\s*$//;               # remove \r\n combo
               last if (/^\.$/);       # done if line is just a period


               # Check status of each unique URL.
               $url = &make_URL($_);
               s/^(.)// && ($atype = $1);
               ($aname, $apath, $ahost, $aport, $aextra) = split(/\t/, $_);
               chop($ahost) if ($ahost =~ /\.$/);
               if (defined($URLs{$url})) {     # already checked
                       print "$margin$aname...$URLs{$url}.\n";
               }
               else {
                       $stat = ($URLs{$url} = &test_URL($url, $aextra));
                       print "$margin$aname...$stat.\n";
               }


               # Recurse as necessary.
               if ($stat eq "ok" &&
                               $atype == 1 &&
                               $ahost eq $inithost &&
                               $aport eq $initport &&
                               $apath ne "" &&
                               &is_Excluded($apath) == 0 &&
                               $apath !~ /ftp.*:/) {
                       &check_Links($ahost, $aport, $apath);
               }
       }
}


################################################
#  make_URL - constructs a URL from a string.  #
#                                              #
#  Notes:                                      #
#      - The URLs generated here are not 100%  #
#        kosher, only used internally.         #
#                                              #
#  Entry:                                      #
#        string as passed by gopher server.    #
#  Exit:                                       #
#        string representing URL.              #
################################################
sub make_URL {
       local($_) = @_;
       local($url);
       local($type, $name, $path, $host, $port);


       s/^(.)// && ($type = $1);
       ($name, $path, $host, $port) = split(/\t/, $_);
       chop($host) if ($host =~ /\.$/);
       if ($type =~ /[01245679sgMhIi]/) {
               $url = "gopher://$host:$port/$type$path";
       }
       elsif ($type =~ /[8T]/) {
               $url = "telnet://";
               $path !~ /^$/ && $url .= "$path@";
               $url .= $host;
               $port > 0 && $url .= ":$port";
               $url .= "/";
       }
       return($url);
}


###########################################################################
#  test_URL - check that a URL is accessible.                             #
#                                                                         #
#  Notes:                                                                 #
#      - I don't have a good way to check gopher0 servers.  Currently, I  #
#        look for the string "error.host", which servers like gn seem to  #
#        generate.  However, this fails with KA9Q, for which an error     #
#        message is indistinguishable from regular text.                  #
#      - For gopher+, a error code indicating a server is too busy is     #
#        treated as an error.  This may not be the right thing to do.     #
#      - If the server understands gopher+, we'll only ask for info (!)   #
#        so as not to retrieve large files.  This approach also seems to  #
#        be the only way to check ASK blocks reliably.                    #
#      - CSO nameservers (type 2) are checked with an invalid command -   #
#        this returns a warning message from the server that is not       #
#        regarded as an error by go4check. Using the command "fields"     #
#        does *not* work since this typically results in lines starting   #
#        with -2, which look like errors.                                 #
#      - Checks of telnet links only see if host is up; no attempt        #
#        is made to login to whatever account may be specified.           #
#      - Checks of FTP links could be improved.  Currently, the info      #
#        returned is not examined beyond looking for the usual signs      #
#        of failure.                                                      #
#  Entry:                                                                 #
#        URL = URL to test                                                #
#        GPLUS = extra character indicating a gopher+ item.               #
#  Exit:                                                                  #
#        Text string indicating status of URL:                            #
#           "ok" = everything ok                                          #
#           "can't connect" = can't connect to host                       #
#           "path changed" = path changed                                 #
#           "n/a" = unknown status                                        #
###########################################################################
sub test_URL {
       local($_, $gplus) = @_;
       local($protocol, $logonid, $host, $port, $type, $path);
       local($1, $2, $3, $4, $5);


       $DEBUG && print "DEBUG: checking $_.\n";
       m#^(\w+)://(.*):(\d+)/?(.?)(.*)#;
       $protocol = $1;
       $host = $2;
       $port = $3;
       $type = $4;
       $path = $5;
       if ($host =~ /@/) {
               ($logonid, $host) = split(/@/, $host);
       }
       $DEBUG && print "protocol=$protocol; logonid=$logonid; host=$host; port=$port; type=$type; path=$path.\n";


       # Check gopher links.
       if ($protocol eq "gopher") {
               local($GOPHER);
               local($Stuff);

               $DEBUG && print "DEBUG: checking gopher at $host;$port.\n";
               ($GOPHER) = &tcpconnect($host, $thishost);
               if ($@ && $@ =~ /Timed Out/) {
                       return "timed out";
               }
               ($GOPHER) || return "can't connect";
               $path .= "\t!" if ($gplus);     # Modify selector to get only info
               if ($type eq "2") {
                       $path = $default_path2 if ($path =~ /^$/);
               }
               elsif ($type eq "7") {
               # Modification Oct. 19/94 by R.D. Cameron to append
               # handle the nonempty $path case:  to test in this
               # case, we send a tab and the search term after the
               # $path.
                       if ($path =~ /^$/) {
                               $path = $default_search_term;
                       }
                       else {
                               $path = "$path\t$default_search_term";
                       }
                       $path =~ s#^waissrc:(.*)/.*$#1$1#;
               }
               $DEBUG && print "DEBUG: sending path \"$path\".\n";
               eval {
                       alarm($timeout);
                       send($GOPHER, "$path\r\n", 0);
                       $Stuff = <$GOPHER>;
                       close($GOPHER);
                       alarm(0);
               };
               if ($@ && $@ =~ /Timed Out/) {
                       return "timed out";
               }
               $DEBUG && print "DEBUG: read \"$Stuff\".\n";


               # Test line for signs of errors.
               #
               # Modification Oct. 19/94 by R.D. Cameron to
               # check for type 3 error returns when a directory
               # listing is expected.  (According to the gopher
               # protocol, "3" as the first character of a directory
               # entry always indicates error.
               if ((($type eq "1") | ($type eq "7")) &
                   ($Stuff =~ /^3/)) {
                       return("path changed");
               }
               # Test line for other signs of errors.
               elsif ($Stuff =~ /(^\-\-\d)|(\terror.host\t\d+)/) {
                       return("path changed");
               }
               else {
                       return("ok");
               }
       }


       # Check telnet links.
       if ($protocol eq "telnet") {
               local($TELNET);

               $DEBUG && print "DEBUG: checking telnet at $host;$port.\n";
               ($TELNET) = &tcpconnect($host, $thishost);
               if ($@ && $@ =~ /Timed Out/) {
                       return "timed out";
               }
               ($TELNET) || return "host down";
               return "ok";
               close($TELNET);
       }


       # If we get here, we don't know how to test the link.
       return("n/a");
}



#######################################################
#  is_Excluded - checks if a path is to be excluded.  #
#                                                     #
#  Entry:                                             #
#        path to be tested.                           #
#  Exit:                                              #
#        0/1 indicating no/yes.                       #
#######################################################
sub is_Excluded {
       local($path) = @_;


       for (@excluded_paths) {
               if (index($path, $_) >= $[) {
                       return(1);
               }
       }
       return(0);
}


################################################################
#  This comes from gopherhunt by Paul Lindner.                 #
#                                                              #
#  I've added a line to abort if it can't resolve an address.  #
#  and return 0 if failure rather than die. GAT                #
#                                                              #
#  I also added an alarm to handle time-out conditions. GAT    #
################################################################
sub tcpconnect {                    #Get TCP info in place
  local($host, $hostname) = @_;
  local($name, $aliases, $type, $len);
  local($thisaddr, $thataddr, $this, $that);
  local($sockaddr);
  $sockaddr = 'S n a4 x8';

  ($name,$aliases,$proto) = getprotobyname('tcp');
  ($name,$aliases,$port) = getservbyname($port, 'tcp')
       unless $port =~ /^\d+$/;
  ($name,$aliases,$type,$len,$thisaddr) = gethostbyname($hostname);
  ($name,$aliases,$type,$len,$thataddr) = gethostbyname($host);
  $name || return(0);

  $this = pack($sockaddr, &AF_INET, 0, $thisaddr);
  $that = pack($sockaddr, &AF_INET, $port, $thataddr);

  sleep($snooze_length);

  eval {
       alarm($timeout);
       socket(N, &PF_INET, &SOCK_STREAM, $proto) || return(0);
       bind(N, $this)                            || return(0);
       connect(N, $that)                         || return(0);
       alarm(0);
  };

  return(N);
}


#####################################################
#  handle_Timeout - Die with a specific message.    #
#                                                   #
#  Notes:                                           #
#        - Calls to alarm() should be in an eval    #
#          block.                                   #
#                                                   #
#  Entry:                                           #
#        n/a                                        #
#  Exit:                                            #
#        Message "Timed Out" is returned.           #
#####################################################
sub handle_Timeout {
       $DEBUG && print "DEBUG: Timed Out.\n";
       die "Timed Out";
}