Article 1597 of comp.infosystems.www:
Xref: feenix.metronet.com comp.infosystems.www:1597
Path: feenix.metronet.com!news.ecn.bgu.edu!usenet.ins.cwru.edu!howland.reston.ans.net!math.ohio-state.edu!cs.utexas.edu!geraldo.cc.utexas.edu!ansel.cc.utexas.edu!not-for-mail
From: [email protected] (Jack Lund)
Newsgroups: comp.infosystems.www
Subject: Re: How to get gopher files with perl?
Date: 14 Sep 1993 13:42:50 -0500
Organization: The University of Texas - Austin
Lines: 711
Message-ID: <[email protected]>
References: <[email protected]>
NNTP-Posting-Host: ansel.cc.utexas.edu

In article <[email protected]>,
Oscar Nierstrasz <[email protected]> wrote:
>
>I want to get a large number of gopher files using a shell/perl script
>(actually I don't want to -- Simon Gibbs next door wants to).
>
>The gopher protocol looked simple, so I thought it would be easy.
>It seems to work, but the files returned are short if they are image files (GIF).
>What am I doing wrong?  Are there any good pointers to on-line doc?
>(I have seen the gopher protocol.rtf at cern, but it seems to be
>somewhat out of date ...)
>
>I thought all you had to do was connect, send the name of the file you
>want, and gobble up the reply.  That doesn't seem to work.
>Are image files compressed or otherwise encoded?
>
>Here is the perl subroutine I am using:

I modified a perl script of yours (remember hget?) to take (just
about) any URL and grab the appropriate document. It works well for
binary & ascii files. You might check it out.

I haven't had a chance to take a close look at your code, but if I
discover anything, I'll let you know.

-------------------------url_get------------------------------------
#!/bin/perl
#
# url_get      --- get a document given a WWW URL
#
# Jack Lund 9/3/93 [email protected]
#
# from hget by:
# Oscar Nierstrasz 26/8/93 [email protected]
#

$home = $ENV{"HOME"};

require "chat2.pl";
require "$home/lib/perl/URL.pl";
require "$home/lib/perl/ftplib.pl";
require "getopts.pl";
&Getopts(':b');

die "Usage: url_get <http-url> ...\n" unless $#ARGV >= 0;
$timeout = undef;

foreach $url (@ARGV) {
       ($protocol, $host, $port, $rest1, $rest2, $rest3) = &url'parse_url($url);
       foo:
   {
       if ($protocol eq "http") {
           &http_get($host,$port,$rest1); last foo;
       }
       if ($protocol eq "gopher") {
           &gopher_get($host, $port, $rest1, $rest2, $rest3); last foo;
       }
       if ($protocol eq "file") {
           &file_get($host, $port, $rest1); last foo;
       }
       if ($protocol eq "news") {
           &news_get($host, $port, $rest1); last foo;
       }
       die "Protocol $protocol not supported!\n";
   }
}

sub http_get {
   local($host,$port,$request) = @_;
   ($handle = &chat'open_port($host, $port))
       || die "chat'open($host,$port): $!\n";
   &chat'print($handle,"GET $request\n")
       || die "chat'print(GET $request): $!\n";
   *S = *chat'S;
       while (<S>) {
               print "$_";
       }
   &chat'close($handle);
}

sub gopher_get {
   local($host,$port,$gtype,$selector,$search) = @_;
   $request = ($search ? "$selector\t$search" : $selector);
   ($handle = &chat'open_port($host, $port))
       || die "chat'open($host,$port): $!\n";
   &chat'print($handle,"$request\n")
       || die "chat'print($request): $!\n";
       *S = *chat'S;

# If this is a binary document, retreive it using sysreads rather
# than <S>

   if ($gtype eq '5' || $gtype eq '9' || $gtype eq 'I') {
           $done = 0;
           $rmask = "";
               vec($rmask,fileno(S),1) = 1;
               do {
                       ($nfound, $rmask) =
                               select($rmask, undef, undef, $timeout);
                       if ($nfound) {
                               $nread = sysread(S, $thisbuf, 1024);
                               if ($nread > 0) {
                                       syswrite(STDOUT, $thisbuf, $nread)
                       || die "Syswrite: $!\n";
                               } else {
                                       $done++;
                               }
                       } else {
                               warn "Timeout\n"; $done++;
                       }
               } until $done;
       }

# This is an ASCII document, and we can get it line-by-line using <S>

       else {
               while (<S>) {
                       last if (/^\.\r\n$/);
                       chop; chop;
                       print "$_\n";
               }
       }
       &chat'close($handle);
}

sub file_get {
   local($host, $port, $path) = @_;

   $localhost = `hostname`;
   if ($host eq $localhost && !defined($port)) {
       open(IN, $path) || die "$path: $!\n";
       while (<IN>) {
           print "$_\n";
       }
       close(IN);
   }
   else {
       &ftp'open($host) || die "Unable to open ftp connection to $host\n";
       &ftp'get($path, "&STDOUT")
           || die "Unable to get file $path from $host\n";
       &ftp'close;
   }
}

sub news_get {
   local($host, $port, $article) = @_;

   ($handle = &chat'open_port($host, $port))
       || die "chat'open($host,$port): $!\n";

   if ($article =~ /^[^<].+@.+[^>]$/) {
       $request = "article <$article>";
   }
   elsif ($article =~ /^<.+@.+>$/) {
       $request = "article $article";
   }
   elsif ($article =~ /^\*$/) {
       die "Only support URLs of the form: news:article\n";
   }
   elsif ($article) {
       die "Only support URLs of the form: news:article\n";
   }
   else {
       die "Bad url\n";
   }

# Read NNTP Connect message

   *S = *chat'S;
   $string = <S>;
   $string =~ /^(\d*) (.*)$/;
   die "NNTP Error: $2\n" unless ($1 eq '200');

# Send request

   &chat'print($handle,"$request\r\n")
       || die "chat'print($request): $!\n";

# Read reply message

   $string = <S>;
   $string =~ /^(\d*) (.*)$/;
   die "NNTP Error: $2\n" unless ($1 eq '220');

# Get article

   while (<S>) {
       last if (/^\.\r\n$/);
       chop; chop;
       print "$_\n";
   }
   &chat'print($handle,"quit\n")
       || die "chat'print(quit): $!\n";
   &chat'close($handle);
}


__END__

-------------------------URL.pl---------------------------------------
#
# URL.pl - package to parse WWW URLs
#
# Jack Lund 9/3/93 [email protected]
#

package url;

# Default port numbers for URL services

$ftp_port = 21;
$http_port = 80;
$gopher_port = 70;
$telnet_port = 23;
$wais_port = 210;
$news_port = 119;

# syntax: &url'parse_url(URL)
# returns array containing following:
#       protocol        protocol string from url. ex: "gopher", "http".
#       host            host that specified protocol server is running on
#       port            port that server answers on
# the rest of the array is protocol-dependant. See code for details.
#

sub parse_url {
   local($url) = @_;

   if ($url =~ m#^(\w+):#) {
       $1 =~ s/[A-Z]/[a-z]/g;
       $protocol = $1;
   } else {
       return undef;
   }

   if ($protocol eq "file" || $protocol eq "ftp") {

# URL of type: file://hostname[:port]/path

       if ($url =~ m#^\s*\w+://([^ \t/:]+):?(\d*)(/.*)$#) {
           $1 =~ s/[A-Z]/[a-z]/;
           $host = $1;
           $port = ($2 ne "" ? $2 : $ftp_port);
           $path = $3;
           return ($protocol, $host, $port, $path);
       }

# URL of type: file:/path

       if ($url =~ m#^\s*\w+:(/.*)$#) {
           $host = `hostname`;  # Current host
           $port = undef;
           return ($protocol, $host, $port, $1);
       }
       return undef;
   }

   if ($protocol eq "news") {

# URL of type: news://host[:port]/article

       if ($url =~ m#^\s*\w+://([^ \t:/]):?(\d*)/(.*)$#) {
           $host = $1;
           $port = ($2 ne "" ? $2 : $news_port);
           $selector = $3;
       }

# URL of type: news:article

       elsif ($url =~ m#^\s*\w+:(.*)$#) {
           $host = $ENV{"NNTPSERVER"};
           unless ($host) {
               warn "Couldn't get NNTP server name\n";
               return undef;
           }
           $port = $news_port;
           $selector = $1;
       }
       else {
           return undef;
       }
       return ($protocol, $host, $port, $selector);
   }

# URL of type: http://host[:port]/path[?search-string]

   if ($protocol eq "http") {
       if ($url =~ m#^\s*\w+://([\w\d\.]+):?(\d*)(/[^ \t\?]+)\??(.)*$#) {
           $1 =~ s/[A-Z]/[a-z]/g;
           $server = $1;
           $port = ($2 ne "" ? $2 : $http_port);
           $path = $3;
           $search = $4;
           return ($protocol, $server, $port, $path, $search);
       }
       return undef;
   }

# URL of type: telnet://user@host[:port]

   if ($protocol eq "telnet") {
       if ($url =~ m#^\s*\w+://([^@]+)@([^: \t]+):?(\d*)$#) {
           $user = $1;
           $2 =~ s/[A-Z]/[a-z]/g;
           $host = $2;
           $port = (defined($3) ? $3 : $telnet_port);
           return($protocol, $host, $port, $user);
       }

# URL of type: telnet://host[:port]

       if ($url =~ m#^\s*\w+://([^: \t]+):?(\d*)$#) {
           $1 =~ s/[A-Z]/[a-z]/g;
           $host = $1;
           $port = (defined($2) ? $2 : $telnet_port);
           return($protocol, $host, $port);
       }
       return undef;
   }

# URL of type: gopher://host[:port]/[gtype]selector-string[?search-string]

   if ($protocol eq "gopher") {
       if ($url =~ m#^\s*\w+://([\w\d\.]+):?(\d*)/(\w?)([^ \t\?]*)\??(.*)$#) {
           $1 =~ s/[A-Z]/[a-z]/g;
           $server = $1;
           $port = ($2 ne "" ? $2 : $gopher_port);
           $gtype = ($3 ne "" ? $3 : 1);
           $selector = $4;
           $search = $5;
           return ($protocol, $server, $port, $gtype, $selector, $search);
       }
       return undef;
   }

# URL of type: wais://host[:port]/database?search-string

   if ($protocol eq "wais") {
       if ($url =~ m#^\s\w+://([\w\d\.]+):?(\d*)/([^\?]+)\??(.*)$#) {
           $1 =~ s/[A-Z]/[a-z]/g;
           $server = $1;
           $port = (defined($2) ? $2 : $wais_port);
           $database = $3;
           $search = $4;
           return ($protocol, $server, $port, $database, $search);
       }
       return undef;
   }
}

-------------------------ftplib.pl---------------------------------------
#
#   This is a set of ftp library routines using chat2.pl
#
#   Return code information taken from RFC 959

#   Written by Gene Spafford  <[email protected]>
#       Last update: 10 April 92,   Version 0.9
#

#
#   Most of these routines communicate over an open ftp channel
#   The channel is opened with the "ftp'open" call.
#

package ftp;
require "chat2.pl";
require "syscall.ph";


###########################################################################
#
#  The following are the variables local to this package.
#  I declare them all up front so I can remember what I called 'em. :-)
#
###########################################################################

LOCAL_VARS: {
   $Control;
   $Data_handle;
   $Host;
   $Myhost = "\0" x 65;
   (syscall(&SYS_gethostname, $Myhost, 65) == 0) ||
       die "Cannot 'gethostname' of local machine (in ftplib)\n";
   $Myhost =~ s/\0*$//;
   $NeedsCleanup;
   $NeedsClose;
   $ftp_error;
   $ftp_matched;
   $ftp_trans_flag;
   @ftp_list;

   local(@tmp) = getservbyname("ftp", "tcp");
   ($FTP = $tmp[2]) ||
       die "Unable to get service number for 'ftp' (in ftplib)!\n";

   @std_actions = (
           'TIMEOUT',
           q($ftp_error = "Connection timed out for $Host!\n"; undef),
           'EOF',
           q($ftp_error = "Connection to $Host timed out unexpectedly!\n"; undef)
   );

   @sigs = ('INT', 'HUP', 'TERM', 'QUIT');  # sigs we'll catch & terminate on
}



###########################################################################
#
#  The following are intended to be the user-callable routines.
#  Each of these does one of the ftp keyword functions.
#
###########################################################################

sub error { ## Public
   $ftp_error;
}

#######################################################

#   cd up a directory level

sub cdup { ## Public
   &do_ftp_cmd(200, "cdup");
}

#######################################################

# close an open ftp connection

sub close { ## Public
   return unless $NeedsClose;
   &do_ftp_cmd(221, "quit");
   &chat'close($Control);
   undef $NeedsClose;
   &do_ftp_signals(0);
}

#######################################################

# change remote directory

sub cwd { ## Public
   &do_ftp_cmd(250, "cwd", @_);
}

#######################################################

#  delete a remote file

sub delete { ## Public
    &do_ftp_cmd(250, "dele", @_);
}

#######################################################

#  get a directory listing of remote directory ("ls -l")

sub dir { ## Public
   &do_ftp_listing("list", @_);
}

#######################################################

#  get a remote file to a local file
#    get(remote[, local])

sub get { ## Public
   local($remote, $local) = @_;
   ($local = $remote) unless $local;

   unless (open(DFILE, ">$local")) {
       $ftp_error =  "Open of local file $local failed: $!";
       return undef;
   } else {
       $NeedsCleanup = $local;
   }

   return undef unless &do_open_dport;         # Open a data channel
   unless (&do_ftp_cmd(150, "retr $remote")) {
       $ftp_error .= "\nFile $remote not fetched from $Host\n";
       close DFILE;
       unlink $local;
       undef $NeedsCleanup;
       return;
   }

   $ftp_trans_flag = 0;

   do {
       &chat'expect($Data_handle, 60,
                    '.|\n', q{print DFILE ($chat'thisbuf) ||
                       ($ftp_trans_flag = 3); undef $chat'S},
                    'EOF',  '$ftp_trans_flag = 1',
                    'TIMEOUT', '$ftp_trans_flag = 2');
   } until $ftp_trans_flag;

   close DFILE;
   &chat'close($Data_handle);          # Close the data channel

   undef $NeedsCleanup;
   if ($ftp_trans_flag > 1) {
       unlink $local;
       $ftp_error = "Unexpected " . ($ftp_trans_flag == 2 ? "timeout" :
               ($ftp_trans_flag != 3 ? "failure" : "local write failure")) .
               " getting $remote\n";
   }

   &do_ftp_cmd(226);
}

#######################################################

#  Do a simple name list ("ls")

sub list { ## Public
   &do_ftp_listing("nlst", @_);
}

#######################################################

#   Make a remote directory

sub mkdir { ## Public
   &do_ftp_cmd(257, "mkd", @_);
}

#######################################################

#  Open an ftp connection to remote host

sub open {  ## Public
   if ($NeedsClose) {
       $ftp_error = "Connection still open to $Host!";
       return undef;
   }

   $Host = shift(@_);
   local($User, $Password, $Acct) = @_;
   $User = "anonymous" unless $User;
   $Password = "-" . $main'ENV{'USER'} . "@$Myhost" unless $Password;
   $ftp_error = '';

   unless($Control = &chat'open_port($Host, $FTP)) {
       $ftp_error = "Unable to connect to $Host ftp port: $!";
       return undef;
   }

   unless(&chat'expect($Control, 60,
                       "^220 .*\n",     "1",
                       "^\d\d\d .*\n",  "undef")) {
       $ftp_error = "Error establishing control connection to $Host";
       &chat'close($Control);
       return undef;
   }
   &do_ftp_signals($NeedsClose = 1);

   unless (&do_ftp_cmd(331, "user $User")) {
       $ftp_error .= "\nUser command failed establishing connection to $Host";
       return undef;
   }

   unless (&do_ftp_cmd("(230|332|202)", "pass $Password")) {
       $ftp_error .= "\nPassword command failed establishing connection to $Host";
       return undef;
   }

   return 1 unless $Acct;

   unless (&do_ftp_cmd("(230|202)", "pass $Password")) {
       $ftp_error .= "\nAcct command failed establishing connection to $Host";
       return undef;
   }
   1;
}

#######################################################

#  Get name of current remote directory

sub pwd { ## Public
   if (&do_ftp_cmd(257, "pwd")) {
       $ftp_matched =~ m/^257 (.+)\r?\n/;
       $1;
   } else {
       undef;
   }
}

#######################################################

#  Rename a remote file

sub rename { ## Public
   local($from, $to) = @_;

   &do_ftp_cmd(350, "rnfr $from") && &do_ftp_cmd(250, "rnto $to");
}

#######################################################

#  Set transfer type

sub type { ## Public
   &do_ftp_cmd(200, "type", @_);
}


###########################################################################
#
#  The following are intended to be utility routines used only locally.
#  Users should not call these directly.
#
###########################################################################

sub do_ftp_cmd {  ## Private
   local($okay, @commands, $val) = @_;

   $commands[0] &&
       &chat'print($Control, join(" ", @commands), "\r\n");

   &chat'expect($Control, 60,
                "^$okay .*\\n",        '$ftp_matched = $&; 1',
                '^(\d)\d\d .*\\n', '($String = $&) =~ y/\r\n//d;
                    $ftp_error = qq{Unexpected reply for ' .
                    "@commands" . ': $String};
                    $1 > 3 ? undef : 1',
                @std_actions
               );
}

#######################################################

sub do_ftp_listing { ## Private
   local(@lcmd) = @_;
   @ftp_list = ();
   $ftp_trans_flag = 0;

   return undef unless &do_open_dport;

   return undef unless &do_ftp_cmd(150, @lcmd);
   do {                        #  Following is grotty, but chat2 makes us do it
       &chat'expect($Data_handle, 30,
               "(.*)\r?\n",    'push(@ftp_list, $1)',
               "EOF",     '$ftp_trans_flag = 1');
   } until $ftp_trans_flag;

   &chat'close($Data_handle);
   return undef unless &do_ftp_cmd(226);

   grep(y/\r\n//d, @ftp_list);
   @ftp_list;
}

#######################################################

sub do_open_dport { ## Private
   local(@foo, $port) = &chat'open_listen;
   ($port, $Data_handle) = splice(@foo, 4, 2);

   unless ($Data_handle) {
       $ftp_error =  "Unable to open data port: $!";
       return undef;
   }

   push(@foo, $port >> 8, $port & 0xff);
   local($myhost) = (join(',', @foo));

   &do_ftp_cmd(200, "port $myhost");
}

#######################################################
#
#  To cleanup after a problem
#

sub do_ftp_abort {
   die unless $NeedsClose;

   &chat'print($Control, "abor", "\r\n");
   &chat'close($Data_handle);
   &chat'expect($Control, 10, '.', undef);
   &chat'close($Control);

   close DFILE;
   unlink($NeedsCleanup) if $NeedsCleanup;
   die;
}

#######################################################
#
#  To set signals to do the abort properly
#

sub do_ftp_signals {
   local($flag, $sig) = @_;

   local ($old, $new) = ('DEFAULT', "ftp'do_ftp_abort");
   $flag || (($old, $new) = ($new, $old));
   foreach $sig (@sigs) {
       ($SIG{$sig} == $old) && ($SIG{$sig} = $new);
   }
}

1;
--
Jack Lund                       Email: [email protected]
Graphics Services               Phone: (512) 471-3241
UT Austin Computation Center
WWW: <A HREF="http://wwwhost.cc.utexas.edu/test/zippy/zippy.html">Zippy</A>!