Article 1597 of comp.infosystems.www:
Xref: feenix.metronet.com comp.infosystems.www:1597
Path: feenix.metronet.com!news.ecn.bgu.edu!usenet.ins.cwru.edu!howland.reston.ans.net!math.ohio-state.edu!cs.utexas.edu!geraldo.cc.utexas.edu!ansel.cc.utexas.edu!not-for-mail
From:
[email protected] (Jack Lund)
Newsgroups: comp.infosystems.www
Subject: Re: How to get gopher files with perl?
Date: 14 Sep 1993 13:42:50 -0500
Organization: The University of Texas - Austin
Lines: 711
Message-ID: <
[email protected]>
References: <
[email protected]>
NNTP-Posting-Host: ansel.cc.utexas.edu
In article <
[email protected]>,
Oscar Nierstrasz <
[email protected]> wrote:
>
>I want to get a large number of gopher files using a shell/perl script
>(actually I don't want to -- Simon Gibbs next door wants to).
>
>The gopher protocol looked simple, so I thought it would be easy.
>It seems to work, but the files returned are short if they are image files (GIF).
>What am I doing wrong? Are there any good pointers to on-line doc?
>(I have seen the gopher protocol.rtf at cern, but it seems to be
>somewhat out of date ...)
>
>I thought all you had to do was connect, send the name of the file you
>want, and gobble up the reply. That doesn't seem to work.
>Are image files compressed or otherwise encoded?
>
>Here is the perl subroutine I am using:
I modified a perl script of yours (remember hget?) to take (just
about) any URL and grab the appropriate document. It works well for
binary & ascii files. You might check it out.
I haven't had a chance to take a close look at your code, but if I
discover anything, I'll let you know.
-------------------------url_get------------------------------------
#!/bin/perl
#
# url_get --- get a document given a WWW URL
#
# Jack Lund 9/3/93
[email protected]
#
# from hget by:
# Oscar Nierstrasz 26/8/93
[email protected]
#
$home = $ENV{"HOME"};
require "chat2.pl";
require "$home/lib/perl/URL.pl";
require "$home/lib/perl/ftplib.pl";
require "getopts.pl";
&Getopts(':b');
die "Usage: url_get <http-url> ...\n" unless $#ARGV >= 0;
$timeout = undef;
foreach $url (@ARGV) {
($protocol, $host, $port, $rest1, $rest2, $rest3) = &url'parse_url($url);
foo:
{
if ($protocol eq "http") {
&http_get($host,$port,$rest1); last foo;
}
if ($protocol eq "gopher") {
&gopher_get($host, $port, $rest1, $rest2, $rest3); last foo;
}
if ($protocol eq "file") {
&file_get($host, $port, $rest1); last foo;
}
if ($protocol eq "news") {
&news_get($host, $port, $rest1); last foo;
}
die "Protocol $protocol not supported!\n";
}
}
sub http_get {
local($host,$port,$request) = @_;
($handle = &chat'open_port($host, $port))
|| die "chat'open($host,$port): $!\n";
&chat'print($handle,"GET $request\n")
|| die "chat'print(GET $request): $!\n";
*S = *chat'S;
while (<S>) {
print "$_";
}
&chat'close($handle);
}
sub gopher_get {
local($host,$port,$gtype,$selector,$search) = @_;
$request = ($search ? "$selector\t$search" : $selector);
($handle = &chat'open_port($host, $port))
|| die "chat'open($host,$port): $!\n";
&chat'print($handle,"$request\n")
|| die "chat'print($request): $!\n";
*S = *chat'S;
# If this is a binary document, retreive it using sysreads rather
# than <S>
if ($gtype eq '5' || $gtype eq '9' || $gtype eq 'I') {
$done = 0;
$rmask = "";
vec($rmask,fileno(S),1) = 1;
do {
($nfound, $rmask) =
select($rmask, undef, undef, $timeout);
if ($nfound) {
$nread = sysread(S, $thisbuf, 1024);
if ($nread > 0) {
syswrite(STDOUT, $thisbuf, $nread)
|| die "Syswrite: $!\n";
} else {
$done++;
}
} else {
warn "Timeout\n"; $done++;
}
} until $done;
}
# This is an ASCII document, and we can get it line-by-line using <S>
else {
while (<S>) {
last if (/^\.\r\n$/);
chop; chop;
print "$_\n";
}
}
&chat'close($handle);
}
sub file_get {
local($host, $port, $path) = @_;
$localhost = `hostname`;
if ($host eq $localhost && !defined($port)) {
open(IN, $path) || die "$path: $!\n";
while (<IN>) {
print "$_\n";
}
close(IN);
}
else {
&ftp'open($host) || die "Unable to open ftp connection to $host\n";
&ftp'get($path, "&STDOUT")
|| die "Unable to get file $path from $host\n";
&ftp'close;
}
}
sub news_get {
local($host, $port, $article) = @_;
($handle = &chat'open_port($host, $port))
|| die "chat'open($host,$port): $!\n";
if ($article =~ /^[^<].+@.+[^>]$/) {
$request = "article <$article>";
}
elsif ($article =~ /^<.+@.+>$/) {
$request = "article $article";
}
elsif ($article =~ /^\*$/) {
die "Only support URLs of the form: news:article\n";
}
elsif ($article) {
die "Only support URLs of the form: news:article\n";
}
else {
die "Bad url\n";
}
# Read NNTP Connect message
*S = *chat'S;
$string = <S>;
$string =~ /^(\d*) (.*)$/;
die "NNTP Error: $2\n" unless ($1 eq '200');
# Send request
&chat'print($handle,"$request\r\n")
|| die "chat'print($request): $!\n";
# Read reply message
$string = <S>;
$string =~ /^(\d*) (.*)$/;
die "NNTP Error: $2\n" unless ($1 eq '220');
# Get article
while (<S>) {
last if (/^\.\r\n$/);
chop; chop;
print "$_\n";
}
&chat'print($handle,"quit\n")
|| die "chat'print(quit): $!\n";
&chat'close($handle);
}
__END__
-------------------------URL.pl---------------------------------------
#
# URL.pl - package to parse WWW URLs
#
# Jack Lund 9/3/93
[email protected]
#
package url;
# Default port numbers for URL services
$ftp_port = 21;
$http_port = 80;
$gopher_port = 70;
$telnet_port = 23;
$wais_port = 210;
$news_port = 119;
# syntax: &url'parse_url(URL)
# returns array containing following:
# protocol protocol string from url. ex: "gopher", "http".
# host host that specified protocol server is running on
# port port that server answers on
# the rest of the array is protocol-dependant. See code for details.
#
sub parse_url {
local($url) = @_;
if ($url =~ m#^(\w+):#) {
$1 =~ s/[A-Z]/[a-z]/g;
$protocol = $1;
} else {
return undef;
}
if ($protocol eq "file" || $protocol eq "ftp") {
# URL of type: file://hostname[:port]/path
if ($url =~ m#^\s*\w+://([^ \t/:]+):?(\d*)(/.*)$#) {
$1 =~ s/[A-Z]/[a-z]/;
$host = $1;
$port = ($2 ne "" ? $2 : $ftp_port);
$path = $3;
return ($protocol, $host, $port, $path);
}
# URL of type: file:/path
if ($url =~ m#^\s*\w+:(/.*)$#) {
$host = `hostname`; # Current host
$port = undef;
return ($protocol, $host, $port, $1);
}
return undef;
}
if ($protocol eq "news") {
# URL of type:
news://host[:port]/article
if ($url =~ m#^\s*\w+://([^ \t:/]):?(\d*)/(.*)$#) {
$host = $1;
$port = ($2 ne "" ? $2 : $news_port);
$selector = $3;
}
# URL of type: news:article
elsif ($url =~ m#^\s*\w+:(.*)$#) {
$host = $ENV{"NNTPSERVER"};
unless ($host) {
warn "Couldn't get NNTP server name\n";
return undef;
}
$port = $news_port;
$selector = $1;
}
else {
return undef;
}
return ($protocol, $host, $port, $selector);
}
# URL of type:
http://host[:port]/path[?search-string]
if ($protocol eq "http") {
if ($url =~ m#^\s*\w+://([\w\d\.]+):?(\d*)(/[^ \t\?]+)\??(.)*$#) {
$1 =~ s/[A-Z]/[a-z]/g;
$server = $1;
$port = ($2 ne "" ? $2 : $http_port);
$path = $3;
$search = $4;
return ($protocol, $server, $port, $path, $search);
}
return undef;
}
# URL of type: telnet://user@host[:port]
if ($protocol eq "telnet") {
if ($url =~ m#^\s*\w+://([^@]+)@([^: \t]+):?(\d*)$#) {
$user = $1;
$2 =~ s/[A-Z]/[a-z]/g;
$host = $2;
$port = (defined($3) ? $3 : $telnet_port);
return($protocol, $host, $port, $user);
}
# URL of type: telnet://host[:port]
if ($url =~ m#^\s*\w+://([^: \t]+):?(\d*)$#) {
$1 =~ s/[A-Z]/[a-z]/g;
$host = $1;
$port = (defined($2) ? $2 : $telnet_port);
return($protocol, $host, $port);
}
return undef;
}
# URL of type:
gopher://host[:port]/[gtype]selector-string[?search-string]
if ($protocol eq "gopher") {
if ($url =~ m#^\s*\w+://([\w\d\.]+):?(\d*)/(\w?)([^ \t\?]*)\??(.*)$#) {
$1 =~ s/[A-Z]/[a-z]/g;
$server = $1;
$port = ($2 ne "" ? $2 : $gopher_port);
$gtype = ($3 ne "" ? $3 : 1);
$selector = $4;
$search = $5;
return ($protocol, $server, $port, $gtype, $selector, $search);
}
return undef;
}
# URL of type: wais://host[:port]/database?search-string
if ($protocol eq "wais") {
if ($url =~ m#^\s\w+://([\w\d\.]+):?(\d*)/([^\?]+)\??(.*)$#) {
$1 =~ s/[A-Z]/[a-z]/g;
$server = $1;
$port = (defined($2) ? $2 : $wais_port);
$database = $3;
$search = $4;
return ($protocol, $server, $port, $database, $search);
}
return undef;
}
}
-------------------------ftplib.pl---------------------------------------
#
# This is a set of ftp library routines using chat2.pl
#
# Return code information taken from RFC 959
# Written by Gene Spafford <
[email protected]>
# Last update: 10 April 92, Version 0.9
#
#
# Most of these routines communicate over an open ftp channel
# The channel is opened with the "ftp'open" call.
#
package ftp;
require "chat2.pl";
require "syscall.ph";
###########################################################################
#
# The following are the variables local to this package.
# I declare them all up front so I can remember what I called 'em. :-)
#
###########################################################################
LOCAL_VARS: {
$Control;
$Data_handle;
$Host;
$Myhost = "\0" x 65;
(syscall(&SYS_gethostname, $Myhost, 65) == 0) ||
die "Cannot 'gethostname' of local machine (in ftplib)\n";
$Myhost =~ s/\0*$//;
$NeedsCleanup;
$NeedsClose;
$ftp_error;
$ftp_matched;
$ftp_trans_flag;
@ftp_list;
local(@tmp) = getservbyname("ftp", "tcp");
($FTP = $tmp[2]) ||
die "Unable to get service number for 'ftp' (in ftplib)!\n";
@std_actions = (
'TIMEOUT',
q($ftp_error = "Connection timed out for $Host!\n"; undef),
'EOF',
q($ftp_error = "Connection to $Host timed out unexpectedly!\n"; undef)
);
@sigs = ('INT', 'HUP', 'TERM', 'QUIT'); # sigs we'll catch & terminate on
}
###########################################################################
#
# The following are intended to be the user-callable routines.
# Each of these does one of the ftp keyword functions.
#
###########################################################################
sub error { ## Public
$ftp_error;
}
#######################################################
# cd up a directory level
sub cdup { ## Public
&do_ftp_cmd(200, "cdup");
}
#######################################################
# close an open ftp connection
sub close { ## Public
return unless $NeedsClose;
&do_ftp_cmd(221, "quit");
&chat'close($Control);
undef $NeedsClose;
&do_ftp_signals(0);
}
#######################################################
# change remote directory
sub cwd { ## Public
&do_ftp_cmd(250, "cwd", @_);
}
#######################################################
# delete a remote file
sub delete { ## Public
&do_ftp_cmd(250, "dele", @_);
}
#######################################################
# get a directory listing of remote directory ("ls -l")
sub dir { ## Public
&do_ftp_listing("list", @_);
}
#######################################################
# get a remote file to a local file
# get(remote[, local])
sub get { ## Public
local($remote, $local) = @_;
($local = $remote) unless $local;
unless (open(DFILE, ">$local")) {
$ftp_error = "Open of local file $local failed: $!";
return undef;
} else {
$NeedsCleanup = $local;
}
return undef unless &do_open_dport; # Open a data channel
unless (&do_ftp_cmd(150, "retr $remote")) {
$ftp_error .= "\nFile $remote not fetched from $Host\n";
close DFILE;
unlink $local;
undef $NeedsCleanup;
return;
}
$ftp_trans_flag = 0;
do {
&chat'expect($Data_handle, 60,
'.|\n', q{print DFILE ($chat'thisbuf) ||
($ftp_trans_flag = 3); undef $chat'S},
'EOF', '$ftp_trans_flag = 1',
'TIMEOUT', '$ftp_trans_flag = 2');
} until $ftp_trans_flag;
close DFILE;
&chat'close($Data_handle); # Close the data channel
undef $NeedsCleanup;
if ($ftp_trans_flag > 1) {
unlink $local;
$ftp_error = "Unexpected " . ($ftp_trans_flag == 2 ? "timeout" :
($ftp_trans_flag != 3 ? "failure" : "local write failure")) .
" getting $remote\n";
}
&do_ftp_cmd(226);
}
#######################################################
# Do a simple name list ("ls")
sub list { ## Public
&do_ftp_listing("nlst", @_);
}
#######################################################
# Make a remote directory
sub mkdir { ## Public
&do_ftp_cmd(257, "mkd", @_);
}
#######################################################
# Open an ftp connection to remote host
sub open { ## Public
if ($NeedsClose) {
$ftp_error = "Connection still open to $Host!";
return undef;
}
$Host = shift(@_);
local($User, $Password, $Acct) = @_;
$User = "anonymous" unless $User;
$Password = "-" . $main'ENV{'USER'} . "@$Myhost" unless $Password;
$ftp_error = '';
unless($Control = &chat'open_port($Host, $FTP)) {
$ftp_error = "Unable to connect to $Host ftp port: $!";
return undef;
}
unless(&chat'expect($Control, 60,
"^220 .*\n", "1",
"^\d\d\d .*\n", "undef")) {
$ftp_error = "Error establishing control connection to $Host";
&chat'close($Control);
return undef;
}
&do_ftp_signals($NeedsClose = 1);
unless (&do_ftp_cmd(331, "user $User")) {
$ftp_error .= "\nUser command failed establishing connection to $Host";
return undef;
}
unless (&do_ftp_cmd("(230|332|202)", "pass $Password")) {
$ftp_error .= "\nPassword command failed establishing connection to $Host";
return undef;
}
return 1 unless $Acct;
unless (&do_ftp_cmd("(230|202)", "pass $Password")) {
$ftp_error .= "\nAcct command failed establishing connection to $Host";
return undef;
}
1;
}
#######################################################
# Get name of current remote directory
sub pwd { ## Public
if (&do_ftp_cmd(257, "pwd")) {
$ftp_matched =~ m/^257 (.+)\r?\n/;
$1;
} else {
undef;
}
}
#######################################################
# Rename a remote file
sub rename { ## Public
local($from, $to) = @_;
&do_ftp_cmd(350, "rnfr $from") && &do_ftp_cmd(250, "rnto $to");
}
#######################################################
# Set transfer type
sub type { ## Public
&do_ftp_cmd(200, "type", @_);
}
###########################################################################
#
# The following are intended to be utility routines used only locally.
# Users should not call these directly.
#
###########################################################################
sub do_ftp_cmd { ## Private
local($okay, @commands, $val) = @_;
$commands[0] &&
&chat'print($Control, join(" ", @commands), "\r\n");
&chat'expect($Control, 60,
"^$okay .*\\n", '$ftp_matched = $&; 1',
'^(\d)\d\d .*\\n', '($String = $&) =~ y/\r\n//d;
$ftp_error = qq{Unexpected reply for ' .
"@commands" . ': $String};
$1 > 3 ? undef : 1',
@std_actions
);
}
#######################################################
sub do_ftp_listing { ## Private
local(@lcmd) = @_;
@ftp_list = ();
$ftp_trans_flag = 0;
return undef unless &do_open_dport;
return undef unless &do_ftp_cmd(150, @lcmd);
do { # Following is grotty, but chat2 makes us do it
&chat'expect($Data_handle, 30,
"(.*)\r?\n", 'push(@ftp_list, $1)',
"EOF", '$ftp_trans_flag = 1');
} until $ftp_trans_flag;
&chat'close($Data_handle);
return undef unless &do_ftp_cmd(226);
grep(y/\r\n//d, @ftp_list);
@ftp_list;
}
#######################################################
sub do_open_dport { ## Private
local(@foo, $port) = &chat'open_listen;
($port, $Data_handle) = splice(@foo, 4, 2);
unless ($Data_handle) {
$ftp_error = "Unable to open data port: $!";
return undef;
}
push(@foo, $port >> 8, $port & 0xff);
local($myhost) = (join(',', @foo));
&do_ftp_cmd(200, "port $myhost");
}
#######################################################
#
# To cleanup after a problem
#
sub do_ftp_abort {
die unless $NeedsClose;
&chat'print($Control, "abor", "\r\n");
&chat'close($Data_handle);
&chat'expect($Control, 10, '.', undef);
&chat'close($Control);
close DFILE;
unlink($NeedsCleanup) if $NeedsCleanup;
die;
}
#######################################################
#
# To set signals to do the abort properly
#
sub do_ftp_signals {
local($flag, $sig) = @_;
local ($old, $new) = ('DEFAULT', "ftp'do_ftp_abort");
$flag || (($old, $new) = ($new, $old));
foreach $sig (@sigs) {
($SIG{$sig} == $old) && ($SIG{$sig} = $new);
}
}
1;
--
Jack Lund Email:
[email protected]
Graphics Services Phone: (512) 471-3241
UT Austin Computation Center
WWW: <A HREF="
http://wwwhost.cc.utexas.edu/test/zippy/zippy.html">Zippy</A>!