# Author: David Griffith <dave@661.org>

#!/usr/bin/env perl

# Author: David Griffith <[email protected]>
# Date: March 4, 2020
# Version: 1.0
# License: Artistic 2.0
#
# Recursively convert Index files into gophermap files.
#
# To make the IF Archive's gopher mirror more pleasant to browse, the
# Index files should be converted into proper gophermap files. This
# script takes a path to the root of the IF Archive. If a path is not
# supplied, then a default of /var/gopher/if-archive is used. If an Index
# file is not present, something intelligent will be done with the files.
#
# This script is intended to be run as a cron(8) job immediately after the
# cron(8) job to update the IF Archive mirror has run. By default, it
# will check to see if the Index file being processed is less than a day
# old. If so, the file will be processed and a new gophermap file
# produced. If the Index file is older than a day, then we assume that it
# hasn't changed since the last update. To force all gophermap files to
# be rewritten, use the -f flag.
#
# For testing purposes, the -s flag can be used to print a single
# gophermap file to standard output. The idea is to point the script at a
# particular directory in the IF Archive and get a gophermap for it.
#
# Option flags:
#
# -? --usage Print simple usage message.
# -h --help Verbose help message.
# -d --dryrun Dry run. Don't write anything.
# -f --force Force. Rewrite gophermap even if Index hasn't changed.
# -s --single Single. Print one gophermap to stdout and exit.
# -v --verbose Verbosity. Say which Index we're working on now.
#
# Examples:
#
# Really just executing "index2gophermap.pl" is enough if the IF Archive
# mirror lives in /var/gopher/if-archive. This default can be changed by
# editing the values of $gopherroot and $ifarchive. Otherwise the Archive
# can be specified on the command line like this:
#
# index2gophermap.pl /export/disk3/gopher/if-archive
#

use strict;
use warnings;
use utf8;
use File::Basename;
use File::Temp qw(tempfile);
use Cwd qw(getcwd);
use Getopt::Long qw(:config no_ignore_case);
use Net::Domain qw(hostname hostfqdn hostdomain);
use Pod::Usage;

use File::LibMagic; # libfile-libmagic-perl
use DateTime; # libdatetime-perl

my $progname = basename($0);
my $version = "0.1";
my $dir = $ARGV[0];
my $hostname = hostname();
my $port = 70;
my $index = "Index";
my $gophermap = "gophermap";
my $gopherroot = "/var/gopher";
my $ifarchive = "if-archive";
my $ifroot = "$gopherroot/$ifarchive";
my $ifdb = "https://ifdb.tads.org/viewgame?id=";
my %options;
my $dt;

my $footer = "\n\n".
"The IF Archive is a public service of the\n".
"Interactive Fiction Technology Foundation.\n".
"hhttp://iftechfoundation.org/\tURL:http://iftechfoundation.org/\n\n".
"This mirror is a public service of 661.org.\n".
"hhttp://661.org/\tURL:http://661.org/\n";

# These extensions are binary, but are detected as text.
# There's got to be a less nasty way to do this.
my @binary = ("pdf", "ps", "lha");

# These should be marked as images in some fashion.
my @image = ("gif", "png", "jpg", "jpeg", "tif", "tiff", "pcx", "bmp", "ico");

# These are for binhex encoded files (primarily for Macs).
# In the IF Archive, there are assorted *.bin files that could be
# MacBinary or TTComp Macintosh archives, a strange ersatz shell archive
# for Unix, or Atari 2600 cartridge images.
my @macbin = ("hqx", "sit", "bin");

# These should be checked if they're for MSDOS.
my @dosexec = ("exe", "com");

# Audio file extensions.
my @audio = ("wav", "au", "aiff", "mp2", "mp3", "ogg");

# HTML file extensions.
my @html = ("html", "htm");

my %audio = map { $_ => 1 } @audio;
my %dosexec = map { $_ => 1 } @dosexec;
my %macbin = map { $_ => 1 } @macbin;
my %image = map { $_ => 1 } @image;
my %binary = map { $_ => 1 } @binary;
my %html = map { $_ => 1 } @html;

# Avoid putting gophermaps in these directories:
my @avoid = ("$ifroot", "$ifroot/unprocessed");
my %avoid = map { $_ => 1 } @avoid;

GetOptions(
'usage|?' => \$options{usage},
'h|help' => \$options{help},
'd|dryrun' => \$options{dryrun},
'f|force' => \$options{force},
's|single' => \$options{single},
'v|verbose' => \$options{verbose},
'V|version' => \$options{version}
);

if ($options{version}) {
print "$progname version $version\n";
exit;
}

pod2usage(1) if $options{usage};
pod2usage(-verbose => 2) if $options{help};

$dt = DateTime->now();
my $starttime = $dt->epoch();
print "$progname: Starting at ";
print $dt->day() . "-" .
$dt->month_abbr() . "-" .
$dt->year() . " " .
$dt->hms(":")."\n";

if ($options{dryrun}) {
print "$progname: Starting dry run.\n";
}
if ($options{force}) {
print "$progname: Forcing rewrite of all gophermaps.\n";
}

if ($ARGV[0]) {
traverse($ARGV[0]);
} else {
traverse($ifroot);
}

my $dt_done = DateTime->now();
print "$progname: Complete at ";
print $dt_done->day() . "-" .
$dt_done->month_abbr() . "-" .
$dt_done->year() . " " .
$dt_done->hms(":")."\n";
print "$progname: Runtime " . (($dt_done->epoch()) - ($starttime)) . " seconds\n";

######################################################################

# Make a gophermap for this directory, then recurse through any
# subdirectories.
#
sub traverse {
my ($dir) = @_;
my $pwd;

chdir $dir;

# On the first call, $dir is a complete path. Subsequently,
# it's just the directory name. So, we need to use getcwd()
# here to make sure to keep things straight.
$pwd = getcwd();

# Make a gophermap for this directory unless it's one of those
# that's not supposed to get one.
if (!exists $avoid{$pwd}) {
make_gophermap();
} elsif ($options{verbose}) {
print "$progname: Avoiding $dir\n";
}

# Check this directory for non-symlink subdirectories.
foreach my $file (glob("*")) {
next if -l $file or $file eq '.' or $file eq '..';
if (-d $file) {
traverse($file);
}
}
chdir "..";
return;
}

# This is where most of the magic happens.
#
# 1. If we have an Index file, proceed only if this Index file is newer
# than X minutes ago.
#
# 2. Get the page header from Index and store it in an array,
# translating any HTML URLs into a gopher URI.
#
# 3. Get the next filename, IFDB hashes (if any), and description (if
# any). Store these line by line in a hash of arrays..
#
# 4. Get lists of all normal files, subdirectories, and symlinks.
#
# 5. Write out the header to a gophermap file.
#
# 6. Write out the list of subdirectories to the gophermap file.
#
# 7. Check a list of files found in Index against the list of all
# normal files. If there's a file missing from Index, aside from
# certain ones we don't want to list, add it to the hash of arrays
# and a list of all files in the hash of array.
#
# 8. Use the list of all files to access the hash of arrays in
# alphabetical order and write out the file entries to the gophermap
# file.
#
sub make_gophermap {
my $pwd = getcwd();
my $index_fh;
my $out_fh;
my $outfile;

my @chunk;
my @header;

my @symlinks;
my @subdirs;
my @files;
my $filename;
my @indexed_files;
my %body;

if ($options{verbose}) {
print "$progname: processing $pwd\n";
}

# If there's an Index file, check it.
if (-f $index) {
my $modtime = -M "$index";
$modtime = $modtime *60*24;

# If the Index file is newer than X minutes ago,
# rewrite the gophermap.
if ($modtime > 30 && !$options{force}) {
if ($options{verbose}) {
print "$progname: skipping $pwd/$index\n";
return;
}
}

if ($options{verbose}) {
print "$progname: rewriting $pwd/$gophermap\n";
}
open ($index_fh, "<", $index) or die "$progname: Unable to read $index. $!\n";

# Get the page header. If there's nothing before the
# first file description, we get nothing.
@header = getchunk($index_fh);

# Parse the list of files in Index and store in a hash
# of arrays.
while ($filename = getfilename($index_fh)) {
push @indexed_files, ($filename);

# If present, the IFDB key is always right after
# the filename.
@chunk = getifdb($index_fh);
if (@chunk) {
my @fixed;
foreach my $line (@chunk) {
push @fixed, ("hIFDB entry\tURL:$ifdb$line");
}
push (@{$body{$filename}}, @fixed);
}

# If there's an HTML URL in the body, we're not
# catching it yet.
@chunk = getchunk($index_fh);
if (@chunk) {
push (@{$body{$filename}}, @chunk);
}
}
close($index_fh);
}

# Regardless of if we have an Index file or not, save lists of
# subdirectories, symlinks, and regular files.
opendir(SUBDIR, $pwd);
while (my $file_test = readdir(SUBDIR)) {
next if $file_test eq '.' or $file_test eq '..';
next if $file_test eq $index or $file_test eq $gophermap;
next if $file_test =~ /^\./;

if (-l $file_test) {
push @symlinks, ($file_test);
} elsif (-d $file_test) {
push @subdirs, ($file_test);
} else {
push @files, ($file_test);
}
}
closedir(SUBDIR);

if ($options{dryrun}) {
$outfile = "/dev/null";
} else {
$outfile = $gophermap;
}

# Now put it all together.
if ($options{single}) {
$out_fh = *STDOUT;
} else {
open ($out_fh, ">", $outfile) or die "$progname: Unable to write to $pwd/$outfile. $!\n";
}

# Print the header.
if (@header) {
foreach my $line (@header) {
print $out_fh "$line\n";
}
}

my $path = $pwd;
$path =~ s/$gopherroot//g;

if (@subdirs) {
my $subdir_count = @subdirs;
print $out_fh "\n\n$subdir_count Subdirector";
if ($subdir_count > 1) {
print $out_fh "ies:\n";
} else {
print $out_fh "y:\n";
}

foreach my $line (sort @subdirs) {
print $out_fh "1" . $line . "\t$path/$line\n";
}
}

if (@indexed_files) {
# Merge unindexed files in with the indexed files and
# their descriptions.
my %indexed_files_hash = map{$_ => 1} @indexed_files;
my @newfiles;
my $count = 0;
foreach my $file (@files) {
if (!exists $indexed_files_hash{$file}) {
push @newfiles, ($file);
}
}
push @indexed_files, @newfiles;
@indexed_files = sort @indexed_files;

# Write out the files and descriptions.
# Timestamp format is defined here.
my $items = @indexed_files;
print $out_fh "\n\n$items item";
if ($items > 1) {
print $out_fh "s";
}
print $out_fh ":\n";

foreach my $file (@indexed_files) {
$dt = DateTime->from_epoch(epoch => (stat($file))[9]);
print $out_fh filetype($file)."$file\t$path/$file\n";
print $out_fh "[" . $dt->day() . "-" .
$dt->month_abbr() . "-" .
$dt->year() . "]\n";
foreach my $line (@{$body{$file}}) {
# Bullet points
if ($line =~ /^-/) {
$line =~ s/^-\s*//;
print $out_fh "i$line\tfoo\t$port\n";
} else {
print $out_fh "$line\n";
}
}
$count++;
if ($count != @indexed_files) {
print $out_fh "\n";
}
}
} elsif (@files) {
# We don't have an Index, so just print file with gopher
# descriptors.
# Timestamp format is defined here.
my $items = @files;
my $count = 0;
print $out_fh "\n\n$items item";
if ($items > 1) {
print $out_fh "s";
}
print $out_fh ":\n";
foreach my $file (@files) {
$dt = DateTime->from_epoch(epoch => (stat($file))[9]);
print $out_fh filetype($file)."$file\t$path/$file\n";
print $out_fh "[" . $dt->day() . "-" .
$dt->month_abbr() . "-" .
$dt->year() . "]\n\n";
}
}

print $out_fh $footer;

if ($options{single}) {
exit;
}

close($out_fh);
return;
}

###################################################################
# These functions below are all called only from make_gophermap()
# or subordinate functions.
###################################################################

# Return the standard gopher filetype for the supplied filename.
# See https://gopher.zone/posts/how-to-gophermap/
#
# According to
# https://dev.to/dotcomboom/the-gopher-protocol-in-brief-1d88, only .wav
# files should be given an item type of s. All other audio formats
# should have an item type of 9. Not sure how correct this is.
#
sub filetype {
my ($file, @junk) = @_;
my $ext;

($ext) = $file =~ /(\.[^.]+)$/;
if (!$ext) {
$ext = "NeVeRhApPeNs";
} else {
$ext =~ s/\.//;
$ext = lc($ext);
}

if (exists $html{$ext}) {
return "h";
} elsif (exists $audio{$ext}) {
my $magic = File::LibMagic->new();
my $info = $magic->info_from_filename($file);
my $description = substr($info->{description}, 0, 22);
if ($description =~ /^RIFF.*/ or $description =~ /Sun\/NeXT\saudio\sdata/) {
return "s";
}
elsif ($description =~ /IFF\sdata,\sAIFF\saudio/) {
return "s";
}
elsif ($description =~ /^Ogg.*/ or $description =~ /MPEG\sADTS,\slayer II.*/) {
return "s";
}
return 9;
} elsif (exists $dosexec{$ext}) {
my $magic = File::LibMagic->new();
my $info = $magic->info_from_filename($file);
my $description = substr($info->{description}, 0, 20);
if ($description =~ /^DOS.*/ or $description =~ /^MS-DOS.*/) {
return 5;
} elsif ($description =~ /^ASCII.*/) {
return 0;
}
return 9;
} elsif (exists $macbin{$ext}) {
# In the IF Archive, there are assorted *.bin files that
# could be MacBinary or TTComp Macintosh archives, a
# strange ersatz shell archive for Unix, or Atari 2600
# cartridge images.
if ($ext eq "bin") {
my $magic = File::LibMagic->new();
my $info = $magic->info_from_filename($file);
my $description = substr($info->{description}, 0, 20);
if ($description =~ /^MacBinary.*/ or
$description =~ /^TTComp.*/) {
return 4; # Macintosh archives.
} elsif ($description =~ /^POSIX.*/) {
return 0; # Text for a shell archive
}
return 9; # Generic binary.
}
return 4; # Other Macintosh archives.
} elsif (exists $image{$ext}) {
if ($ext eq "gif") {
return "g";
}
return "I";
} elsif (exists $binary{$ext}) {
return 9;
} elsif (-d $file) {
return 1;
} elsif (!-T $file) {
return 9;
}
return 0;
}

# Get a key to an IFDB entry. If present, it will always be on the next
# line after a filename. If there's something on the next line after an
# IFDB key, it'll be another IFDB key.
#
sub getifdb {
my $fh = shift;
my @tuids;
my $got_tuid = 0;
my $pos;
my $line;

$line = <$fh>;
if ($line =~ /^tuid:/) {
$line =~ s/^tuid:\s*//;
chomp($line);
push @tuids, ($line);
while (my $nextline = <$fh>) {
if ($nextline =~ /^\s*$/) {
return @tuids;
} else {
$nextline =~ s/^\s*//;
chomp($nextline);
push @tuids, ($nextline);
}
}
}
return;
}

# Return the next filename or nothing if not found (EOF).
# Filename is marked like this: /^#\sfilename/
#
sub getfilename {
my $fh = shift;
my $line;

while (my $line = <$fh>) {
if ($line =~ /^#/) {
chomp($line);
$line =~ s/^#\s*//;
return $line;
}
}
return;
}

# Return a chunk of text in the form of an array, one line per element,
# describing a file.
#
sub getchunk {
my $fh = shift;
my $line;
my $topdone;
my $filename;
my @chunk;
my $pos;

# Get text until the next hashmark or EOF.
$pos = tell $fh;
while ($line = <$fh>) {
# Get rid of whitespace at top of file.
if (($line =~ /^\s*$/) && !$topdone) { next; }
# If EOF, simply return what we got.
if (!$line) {
return @chunk;
}
# A hashmark indicates the next file, so if that's what we
# have, go back to the previous line and terminate the loop.
if ($line =~ /^#/) {
seek $fh, $pos, 0;
last;
}
# Angle brackets mark URLs
if ($line =~ /\<.*\>/) {
my @result;
chomp($line);
$line =~ s/^[^\<]*\<//;
$line =~ s/\>.*//;
$line = "h" . $line . "\tURL:$line";
}
$topdone = 1; # We're past any leading blank lines.
chomp($line);
push @chunk, $line;
$pos = tell $fh;
}
# Get rid of trailing blank lines.
foreach my $foo (@chunk) {
if ($chunk[-1] =~ /^\s*$/) {
pop @chunk;
}
}
return @chunk;
}

__END__

=head1 NAME

index2gophermap.pl - Recursively convert Index files into gophermap files

=head1 SYNOPSIS

index2gophermap.pl [options] [<directory>]

=head1 DESCRIPTION

To make the IF Archive's gopher mirror more pleasant to browse, the
Index files should be converted into proper gophermap files. This
script takes a path to the root of the IF Archive. If a path is not
supplied, then a default of /var/gopher/if-archive is used. If an Index
file is not present, something intelligent will be done with the files.

This script is intended to be run as a cron(8) job immediately after the
cron(8) job to update the IF Archive mirror has run. By default, it
will check to see if the Index file being processed is less than a day
old. If so, the file will be processed and a new gophermap file
produced. If the Index file is older than a day, then we assume that it
hasn't changed since the last update. To force all gophermap files to
be rewritten, use the -f flag.

For testing purposes, the -s flag can be used to print a single
gophermap file to standard output. The idea is to point the script at a
particular directory in the IF Archive and get a gophermap for it.

=head2 Option flags

-? --usage Print simple usage message.
-h --help Verbose help message.
-d --dryrun Dry run. Don't write anything.
-f --force Force. Rewrite gophermap even if Index hasn't changed.
-s --single Single. Print one gophermap to stdout and exit.
-v --verbose Verbosity. Say which Index we're working on now.

=head2 Examples

Really just executing "index2gophermap.pl" is enough if the IF Archive
mirror lives in /var/gopher/if-archive. This default can be changed by
editing the values of $gopherroot and $ifarchive. Otherwise the Archive
can be specified on the command line like this:

index2gophermap.pl /export/disk3/gopher/if-archive

=head1 LICENSE

Artistic License 2.0

=head1 AUTHOR

David Griffith <[email protected]>