#!/usr/bin/env perl

# Author:       David Griffith <[email protected]>
# Date:         March 4, 2020
# Version:      1.0
# License:      Artistic 2.0
#
# Recursively convert Index files into gophermap files.
#
# To make the IF Archive's gopher mirror more pleasant to browse, the
# Index files should be converted into proper gophermap files.  This
# script takes a path to the root of the IF Archive.  If a path is not
# supplied, then a default of /var/gopher/if-archive is used.  If an Index
# file is not present, something intelligent will be done with the files.
#
# This script is intended to be run as a cron(8) job immediately after the
# cron(8) job to update the IF Archive mirror has run.  By default, it
# will check to see if the Index file being processed is less than a day
# old.  If so, the file will be processed and a new gophermap file
# produced.  If the Index file is older than a day, then we assume that it
# hasn't changed since the last update.  To force all gophermap files to
# be rewritten, use the -f flag.
#
# For testing purposes, the -s flag can be used to print a single
# gophermap file to standard output.  The idea is to point the script at a
# particular directory in the IF Archive and get a gophermap for it.
#
# Option flags:
#
#  -?  --usage           Print simple usage message.
#  -h  --help            Verbose help message.
#  -d  --dryrun          Dry run.  Don't write anything.
#  -f  --force           Force. Rewrite gophermap even if Index hasn't changed.
#  -s  --single          Single. Print one gophermap to stdout and exit.
#  -v  --verbose         Verbosity. Say which Index we're working on now.
#
# Examples:
#
# Really just executing "index2gophermap.pl" is enough if the IF Archive
# mirror lives in /var/gopher/if-archive.  This default can be changed by
# editing the values of $gopherroot and $ifarchive.  Otherwise the Archive
# can be specified on the command line like this:
#
#   index2gophermap.pl /export/disk3/gopher/if-archive
#

use strict;
use warnings;
use utf8;
use File::Basename;
use File::Temp qw(tempfile);
use Cwd qw(getcwd);
use Getopt::Long qw(:config no_ignore_case);
use Net::Domain qw(hostname hostfqdn hostdomain);
use Pod::Usage;

use File::LibMagic;     # libfile-libmagic-perl
use DateTime;           # libdatetime-perl

my $progname = basename($0);
my $version = "0.1";
my $dir = $ARGV[0];
my $hostname = hostname();
my $port = 70;
my $index = "Index";
my $gophermap = "gophermap";
my $gopherroot = "/var/gopher";
my $ifarchive = "if-archive";
my $ifroot = "$gopherroot/$ifarchive";
my $ifdb = "https://ifdb.tads.org/viewgame?id=";
my %options;
my $dt;

my $footer =    "\n\n".
               "The IF Archive is a public service of the\n".
               "Interactive Fiction Technology Foundation.\n".
               "hhttp://iftechfoundation.org/\tURL:http://iftechfoundation.org/\n\n".
               "This mirror is a public service of 661.org.\n".
               "hhttp://661.org/\tURL:http://661.org/\n";

# These extensions are binary, but are detected as text.
# There's got to be a less nasty way to do this.
my @binary = ("pdf", "ps", "lha");

# These should be marked as images in some fashion.
my @image = ("gif", "png", "jpg", "jpeg", "tif", "tiff", "pcx", "bmp", "ico");

# These are for binhex encoded files (primarily for Macs).
# In the IF Archive, there are assorted *.bin files that could be
# MacBinary or TTComp Macintosh archives, a strange ersatz shell archive
# for Unix, or Atari 2600 cartridge images.
my @macbin = ("hqx", "sit", "bin");

# These should be checked if they're for MSDOS.
my @dosexec = ("exe", "com");

# Audio file extensions.
my @audio = ("wav", "au", "aiff", "mp2", "mp3", "ogg");

# HTML file extensions.
my @html = ("html", "htm");

my %audio       = map { $_ => 1 } @audio;
my %dosexec     = map { $_ => 1 } @dosexec;
my %macbin      = map { $_ => 1 } @macbin;
my %image       = map { $_ => 1 } @image;
my %binary      = map { $_ => 1 } @binary;
my %html        = map { $_ => 1 } @html;


# Avoid putting gophermaps in these directories:
my @avoid = ("$ifroot", "$ifroot/unprocessed");
my %avoid = map { $_ => 1 } @avoid;


GetOptions(
       'usage|?' => \$options{usage},
       'h|help' => \$options{help},
       'd|dryrun' => \$options{dryrun},
       'f|force' => \$options{force},
       's|single' => \$options{single},
       'v|verbose' => \$options{verbose},
       'V|version' => \$options{version}
       );

if ($options{version}) {
       print "$progname version $version\n";
       exit;
}

pod2usage(1) if $options{usage};
pod2usage(-verbose => 2) if $options{help};

$dt = DateTime->now();
my $starttime = $dt->epoch();
print "$progname: Starting at ";
print   $dt->day() . "-" .
       $dt->month_abbr() . "-" .
       $dt->year() . " " .
       $dt->hms(":")."\n";


if ($options{dryrun}) {
       print "$progname: Starting dry run.\n";
}
if ($options{force}) {
       print "$progname: Forcing rewrite of all gophermaps.\n";
}

if ($ARGV[0]) {
       traverse($ARGV[0]);
} else {
       traverse($ifroot);
}

my $dt_done = DateTime->now();
print "$progname: Complete at ";
print   $dt_done->day() . "-" .
       $dt_done->month_abbr() . "-" .
       $dt_done->year() . " " .
       $dt_done->hms(":")."\n";
print "$progname: Runtime " . (($dt_done->epoch()) - ($starttime)) . " seconds\n";


######################################################################

# Make a gophermap for this directory, then recurse through any
# subdirectories.
#
sub traverse {
       my ($dir) = @_;
       my $pwd;

       chdir $dir;

       # On the first call, $dir is a complete path.  Subsequently,
       # it's just the directory name.  So, we need to use getcwd()
       # here to make sure to keep things straight.
       $pwd = getcwd();

       # Make a gophermap for this directory unless it's one of those
       # that's not supposed to get one.
       if (!exists $avoid{$pwd}) {
               make_gophermap();
       } elsif ($options{verbose}) {
                print "$progname: Avoiding $dir\n";
       }

       # Check this directory for non-symlink subdirectories.
       foreach my $file (glob("*")) {
               next if -l $file or $file eq '.' or $file eq '..';
               if (-d $file) {
                       traverse($file);
               }
       }
       chdir "..";
       return;
}


# This is where most of the magic happens.
#
# 1.  If we have an Index file, proceed only if this Index file is newer
#     than X minutes ago.
#
# 2.  Get the page header from Index and store it in an array,
#     translating any HTML URLs into a gopher URI.
#
# 3.  Get the next filename, IFDB hashes (if any), and description (if
#     any).  Store these line by line in a hash of arrays..
#
# 4.  Get lists of all normal files, subdirectories, and symlinks.
#
# 5.  Write out the header to a gophermap file.
#
# 6.  Write out the list of subdirectories to the gophermap file.
#
# 7.  Check a list of files found in Index against the list of all
#     normal files.  If there's a file missing from Index, aside from
#     certain ones we don't want to list, add it to the hash of arrays
#     and a list of all files in the hash of array.
#
# 8.  Use the list of all files to access the hash of arrays in
#     alphabetical order and write out the file entries to the gophermap
#     file.
#
sub make_gophermap {
       my $pwd = getcwd();
       my $index_fh;
       my $out_fh;
       my $outfile;

       my @chunk;
       my @header;

       my @symlinks;
       my @subdirs;
       my @files;
       my $filename;
       my @indexed_files;
       my %body;

       if ($options{verbose}) {
               print "$progname: processing $pwd\n";
       }

       # If there's an Index file, check it.
       if (-f $index) {
               my $modtime = -M "$index";
               $modtime = $modtime *60*24;

               # If the Index file is newer than X minutes ago,
               # rewrite the gophermap.
               if ($modtime > 30 && !$options{force}) {
                       if ($options{verbose}) {
                               print "$progname: skipping $pwd/$index\n";
                               return;
                       }
               }

               if ($options{verbose}) {
                       print "$progname: rewriting $pwd/$gophermap\n";
               }
               open ($index_fh, "<", $index) or die "$progname: Unable to read $index. $!\n";

               # Get the page header. If there's nothing before the
               # first file description, we get nothing.
               @header = getchunk($index_fh);

               # Parse the list of files in Index and store in a hash
               # of arrays.
               while ($filename = getfilename($index_fh)) {
                       push @indexed_files, ($filename);

                       # If present, the IFDB key is always right after
                       # the filename.
                       @chunk = getifdb($index_fh);
                       if (@chunk) {
                               my @fixed;
                               foreach my $line (@chunk) {
                                       push @fixed, ("hIFDB entry\tURL:$ifdb$line");
                               }
                               push (@{$body{$filename}}, @fixed);
                       }

                       # If there's an HTML URL in the body, we're not
                       # catching it yet.
                       @chunk = getchunk($index_fh);
                       if (@chunk) {
                               push (@{$body{$filename}}, @chunk);
                       }
               }
               close($index_fh);
       }

       # Regardless of if we have an Index file or not, save lists of
       # subdirectories, symlinks, and regular files.
       opendir(SUBDIR, $pwd);
       while (my $file_test = readdir(SUBDIR)) {
               next if $file_test eq '.' or $file_test eq '..';
               next if $file_test eq $index or $file_test eq $gophermap;
               next if $file_test =~ /^\./;

               if (-l $file_test) {
                       push @symlinks, ($file_test);
               } elsif (-d $file_test) {
                       push @subdirs, ($file_test);
               } else {
                       push @files, ($file_test);
               }
       }
       closedir(SUBDIR);

       if ($options{dryrun}) {
               $outfile = "/dev/null";
       } else {
               $outfile = $gophermap;
       }

       # Now put it all together.
       if ($options{single}) {
               $out_fh = *STDOUT;
       } else {
               open ($out_fh, ">", $outfile) or die "$progname: Unable to write to $pwd/$outfile. $!\n";
       }

       # Print the header.
       if (@header) {
               foreach my $line (@header) {
                       print $out_fh "$line\n";
               }
       }

       my $path = $pwd;
       $path =~ s/$gopherroot//g;

       if (@subdirs) {
               my $subdir_count = @subdirs;
               print $out_fh "\n\n$subdir_count Subdirector";
               if ($subdir_count > 1) {
                       print $out_fh "ies:\n";
               } else {
                       print $out_fh "y:\n";
               }

               foreach my $line (sort @subdirs) {
                       print $out_fh "1" . $line . "\t$path/$line\n";
               }
       }

       if (@indexed_files) {
               # Merge unindexed files in with the indexed files and
               # their descriptions.
               my %indexed_files_hash = map{$_ => 1} @indexed_files;
               my @newfiles;
               my $count = 0;
               foreach my $file (@files) {
                       if (!exists $indexed_files_hash{$file}) {
                               push @newfiles, ($file);
                       }
               }
               push @indexed_files, @newfiles;
               @indexed_files = sort @indexed_files;

               # Write out the files and descriptions.
               # Timestamp format is defined here.
               my $items = @indexed_files;
               print $out_fh "\n\n$items item";
               if ($items > 1) {
                       print $out_fh "s";
               }
               print $out_fh ":\n";

               foreach my $file (@indexed_files) {
                       $dt = DateTime->from_epoch(epoch => (stat($file))[9]);
                       print $out_fh filetype($file)."$file\t$path/$file\n";
                       print $out_fh "[" .     $dt->day() . "-" .
                                               $dt->month_abbr() . "-" .
                                               $dt->year() . "]\n";
                       foreach my $line (@{$body{$file}}) {
                               # Bullet points
                               if ($line =~ /^-/) {
                                       $line =~ s/^-\s*//;
                                       print $out_fh "i$line\tfoo\t$port\n";
                               } else {
                                       print $out_fh "$line\n";
                               }
                       }
                       $count++;
                       if ($count != @indexed_files) {
                               print $out_fh "\n";
                       }
               }
       } elsif (@files) {
               # We don't have an Index, so just print file with gopher
               # descriptors.
               # Timestamp format is defined here.
               my $items = @files;
               my $count = 0;
               print $out_fh "\n\n$items item";
               if ($items > 1) {
                       print $out_fh "s";
               }
               print $out_fh ":\n";
               foreach my $file (@files) {
                       $dt = DateTime->from_epoch(epoch => (stat($file))[9]);
                       print $out_fh filetype($file)."$file\t$path/$file\n";
                       print $out_fh "[" .     $dt->day() . "-" .
                                               $dt->month_abbr() . "-" .
                                               $dt->year() . "]\n\n";
               }
       }

       print $out_fh $footer;

       if ($options{single}) {
               exit;
       }

       close($out_fh);
       return;
}


###################################################################
# These functions below are all called only from make_gophermap()
# or subordinate functions.
###################################################################


# Return the standard gopher filetype for the supplied filename.
# See https://gopher.zone/posts/how-to-gophermap/
#
# According to
# https://dev.to/dotcomboom/the-gopher-protocol-in-brief-1d88, only .wav
# files should be given an item type of s.  All other audio formats
# should have an item type of 9.  Not sure how correct this is.
#
sub filetype {
       my ($file, @junk) = @_;
       my $ext;

       ($ext) = $file =~ /(\.[^.]+)$/;
       if (!$ext) {
               $ext = "NeVeRhApPeNs";
       } else {
               $ext =~ s/\.//;
               $ext = lc($ext);
       }

       if (exists $html{$ext}) {
               return "h";
       } elsif (exists $audio{$ext}) {
               my $magic = File::LibMagic->new();
               my $info = $magic->info_from_filename($file);
               my $description = substr($info->{description}, 0, 22);
               if ($description =~ /^RIFF.*/ or $description =~ /Sun\/NeXT\saudio\sdata/) {
                       return "s";
               }
               elsif ($description =~ /IFF\sdata,\sAIFF\saudio/) {
                       return "s";
               }
               elsif ($description =~ /^Ogg.*/ or $description =~ /MPEG\sADTS,\slayer II.*/) {
                       return "s";
               }
               return 9;
       } elsif (exists $dosexec{$ext}) {
               my $magic = File::LibMagic->new();
               my $info = $magic->info_from_filename($file);
               my $description = substr($info->{description}, 0, 20);
               if ($description =~ /^DOS.*/ or $description =~ /^MS-DOS.*/) {
                       return 5;
               } elsif ($description =~ /^ASCII.*/) {
                       return 0;
               }
               return 9;
       } elsif (exists $macbin{$ext}) {
               # In the IF Archive, there are assorted *.bin files that
               # could be MacBinary or TTComp Macintosh archives, a
               # strange ersatz shell archive for Unix, or Atari 2600
               # cartridge images.
               if ($ext eq "bin") {
                       my $magic = File::LibMagic->new();
                       my $info = $magic->info_from_filename($file);
                       my $description = substr($info->{description}, 0, 20);
                       if ($description =~ /^MacBinary.*/ or
                           $description =~ /^TTComp.*/) {
                               return 4;       # Macintosh archives.
                       } elsif ($description =~ /^POSIX.*/) {
                               return 0;       # Text for a shell archive
                       }
                       return 9;               # Generic binary.
               }
               return 4;       # Other Macintosh archives.
       } elsif (exists $image{$ext}) {
               if ($ext eq "gif") {
                       return "g";
               }
               return "I";
       } elsif (exists $binary{$ext}) {
               return 9;
       } elsif (-d $file) {
               return 1;
       } elsif (!-T $file) {
               return 9;
       }
       return 0;
}


# Get a key to an IFDB entry.  If present, it will always be on the next
# line after a filename.  If there's something on the next line after an
# IFDB key, it'll be another IFDB key.
#
sub getifdb {
       my $fh = shift;
       my @tuids;
       my $got_tuid = 0;
       my $pos;
       my $line;

       $line = <$fh>;
       if ($line =~ /^tuid:/) {
               $line =~ s/^tuid:\s*//;
               chomp($line);
               push @tuids, ($line);
               while (my $nextline = <$fh>) {
                       if ($nextline =~ /^\s*$/) {
                               return @tuids;
                       } else {
                               $nextline =~ s/^\s*//;
                               chomp($nextline);
                               push @tuids, ($nextline);
                       }
               }
       }
       return;
}


# Return the next filename or nothing if not found (EOF).
# Filename is marked like this: /^#\sfilename/
#
sub getfilename {
       my $fh = shift;
       my $line;

       while (my $line = <$fh>) {
               if ($line =~ /^#/) {
                       chomp($line);
                       $line =~ s/^#\s*//;
                       return $line;
               }
       }
       return;
}


# Return a chunk of text in the form of an array, one line per element,
# describing a file.
#
sub getchunk {
       my $fh = shift;
       my $line;
       my $topdone;
       my $filename;
       my @chunk;
       my $pos;

       # Get text until the next hashmark or EOF.
       $pos = tell $fh;
       while ($line = <$fh>) {
               # Get rid of whitespace at top of file.
               if (($line =~ /^\s*$/) && !$topdone) { next; }
               # If EOF, simply return what we got.
               if (!$line) {
                       return @chunk;
               }
               # A hashmark indicates the next file, so if that's what we
               # have, go back to the previous line and terminate the loop.
               if ($line =~ /^#/) {
                       seek $fh, $pos, 0;
                       last;
               }
               # Angle brackets mark URLs
               if ($line =~ /\<.*\>/) {
                       my @result;
                       chomp($line);
                       $line =~ s/^[^\<]*\<//;
                       $line =~ s/\>.*//;
                       $line = "h" . $line . "\tURL:$line";
               }
               $topdone = 1;   # We're past any leading blank lines.
               chomp($line);
               push @chunk, $line;
               $pos = tell $fh;
       }
       # Get rid of trailing blank lines.
       foreach my $foo (@chunk) {
               if ($chunk[-1] =~ /^\s*$/) {
                       pop @chunk;
               }
       }
       return @chunk;
}


__END__


=head1 NAME

index2gophermap.pl - Recursively convert Index files into gophermap files

=head1 SYNOPSIS

index2gophermap.pl [options] [<directory>]

=head1 DESCRIPTION

To make the IF Archive's gopher mirror more pleasant to browse, the
Index files should be converted into proper gophermap files.  This
script takes a path to the root of the IF Archive.  If a path is not
supplied, then a default of /var/gopher/if-archive is used.  If an Index
file is not present, something intelligent will be done with the files.

This script is intended to be run as a cron(8) job immediately after the
cron(8) job to update the IF Archive mirror has run.  By default, it
will check to see if the Index file being processed is less than a day
old.  If so, the file will be processed and a new gophermap file
produced.  If the Index file is older than a day, then we assume that it
hasn't changed since the last update.  To force all gophermap files to
be rewritten, use the -f flag.

For testing purposes, the -s flag can be used to print a single
gophermap file to standard output.  The idea is to point the script at a
particular directory in the IF Archive and get a gophermap for it.

=head2 Option flags

 -?  --usage           Print simple usage message.
 -h  --help            Verbose help message.
 -d  --dryrun          Dry run.  Don't write anything.
 -f  --force           Force. Rewrite gophermap even if Index hasn't changed.
 -s  --single          Single. Print one gophermap to stdout and exit.
 -v  --verbose         Verbosity. Say which Index we're working on now.

=head2 Examples

Really just executing "index2gophermap.pl" is enough if the IF Archive
mirror lives in /var/gopher/if-archive.  This default can be changed by
editing the values of $gopherroot and $ifarchive.  Otherwise the Archive
can be specified on the command line like this:

 index2gophermap.pl /export/disk3/gopher/if-archive


=head1 LICENSE

Artistic License 2.0

=head1 AUTHOR

David Griffith <[email protected]>