#!/usr/bin/tclsh

# gopher-get.tcl version 4 by Ben Collver
# Short script to mirror a gopher site.
# It works similar to `wget --mirror`.

package require TclCurl
package require Tclx
package require uri

proc curl_exit {ch code} {
   $ch cleanup
   exit $code
}

proc download_delay {} {
   # be a good bot and sleep between downloads (uses Tclx [sleep])
   sleep 1
}

# fetch a gopher document by URL
# if a directory index has been fetched,
# then return a file name for the contents

proc fetch {ch url} {
   set parts [gopher_uri_split $url]
   set parsed [dict create {*}$parts]

   set host [dict get $parsed fqdn]
   set type [dict get $parsed type]
   set path [dict get $parsed path]

   if {[string length $host] == 0 ||
       [string length $type] == 0 ||
       [string length $path] == 0
   } {
       puts "Could not parse URL $url"
       return ""
   }

   if {$type eq "0"} {
       if {[string index $path end] eq "/"} {
           set filename "index.gph"
           set file [format "%s/%s%s%s" $host $type $path $filename]
       } else {
           set file [format "%s/%s%s" $host $type $path]
       }
   } elseif {$type eq "1"} {
       if {[string index $path end] ne "/"} {
           set path "$path/"
       }
       set filename "index.gph"
       set file [format "%s/%s%s%s" $host $type $path $filename]
   } elseif {$type eq "9"} {
       if {[string index $path end] eq "/"} {
           puts "Bad binary filename $path"
           return ""
       } else {
           set file [format "%s/%s%s" $host $type $path]
       }
   } elseif {$type eq "I" || $type eq "g"} {
       if {[string index $path end] eq "/"} {
           puts "Bad image filename $path"
           return ""
       }
       set file [format "%s/%s%s" $host $type $path]
   } else {
       puts "Unknown item type $type"
       return ""
   }

   set dirname [file dirname $file]
   if {![file exists $dirname]} {
       puts "mkdir $dirname"
       file mkdir $dirname
   }
   if {![file exists $file]} {
       $ch configure -file $file -url $url
       puts "Downloading $url ..."
       set result [$ch perform]
       if {$result != 0} {
           puts "Curl error $result"
       }
       download_delay
   }
   if {$type eq "1"} {
       set retval $file
   } else {
       set retval ""
   }
   return $retval
}

# gopher_uri_pattern_orig returns a regular expression to parse gopher URL's
# the same pattern exists in $::uri::gopher::url
# this unused procedure is here to compare with gopher_uri_pattern

proc gopher_uri_pattern_orig {} {
   set escape $::uri::basic::escape
   set hostOrPort $::uri::basic::hostOrPort
   set search $::uri::http::search

   # make $xCharN the same as $::uri::basic::xCharN
   set xCharN {[a-zA-Z0-9$_.+!*'(,);/?:@&=-]}

   # make $xChar the same as $::uri::basic::xChar
   set xChar "(${xCharN}|${escape})"

   # make $url the same as $::uri::gopher::url
   set type $xChar
   set selector "$xChar*"
   set string $selector
   set schemepart  "//${hostOrPort}(/(${type}(${selector}(%09${search}(%09${string})?)?)?)?)?"
   set url "gopher:${schemepart}"

   return $::uri::gopher::url
}

# gopher_uri_pattern returns a regular expression to parse gopher URL's
# a similar regular expression can be found in $::uri::gopher::url
# this version is the same, except it adds a tilde as an acceptable
# character.
#
# Why didn't tcllib uri allow tilde in the first place?
# See link below for pedantic reasons against allowing tilde.
# https://jkorpela.fi/tilde.html

proc gopher_uri_pattern {} {
   set escape $::uri::basic::escape
   set hostOrPort $::uri::basic::hostOrPort
   set search $::uri::http::search

   # add tilde "~" to $xCharN
   set xCharN {[a-zA-Z0-9$_.+~!*'(,);/?:@&=-]}

   # add tilde "~" to $xChar
   set xChar "(${xCharN}|${escape})"

   # add tilde "~" to $url
   set type $xChar
   set selector "$xChar*"
   set string $selector
   set schemepart  "//${hostOrPort}(/(${type}(${selector}(%09${search}(%09${string})?)?)?)?)?"
   set url "gopher:${schemepart}"

   return $url
}

proc gopher_uri_split {url} {
   set pattern [gopher_uri_pattern]
   set parts [regexp -inline $pattern $url]

   # protect against relative path abuse
   set path [lindex $parts 11]
   set path [string map [list "/../" "/dotdot/"] $path]

   set retval [list                 \
       url       [lindex $parts 0]  \
       fqdn      [lindex $parts 1]  \
       fqdn2     [lindex $parts 2]  \
       host_dot  [lindex $parts 3]  \
       host      [lindex $parts 4]  \
       domain    [lindex $parts 5]  \
       unused1   [lindex $parts 6]  \
       unused2   [lindex $parts 7]  \
       req_abs   [lindex $parts 8]  \
       req_rel   [lindex $parts 9]  \
       type      [lindex $parts 10] \
       path      $path              \
       last_char [lindex $parts 12] \
       unused3   [lindex $parts 13] \
       unused4   [lindex $parts 14] \
       unused5   [lindex $parts 15] \
       unused6   [lindex $parts 16] \
   ]
}

# parse a gopher directory index for links

proc parse {filename} {
  set retval [list]

  set fh [open $filename]
  set text [read $fh]
  close $fh

  set text [string map [list "\r\n" "\n"] $text]
  set lines [split $text "\n"]
  foreach {line} $lines {
     set type [string index $line 0]
     set data [string range $line 1 end]
     set fields [split $data "\t"]
     lassign $fields label path server port
     if {$type eq "1"} {
         if {[string index $path end] ne "/"} {
             set path "$path/"
         }
     }
     if {$type eq "0" ||
         $type eq "1" ||
         $type eq "9" ||
         $type eq "I" ||
         $type eq "g"
     } {
         if {$port == 70} {
             set url [format "gopher://%s/%s%s" $server $type $path]
         } else {
             set url [format "gopher://%s:%d/%s%s" $server $port $type $path]
         }
         lappend retval $url
     } elseif {$type eq "h" || $type eq "i"} {
         # skip "h" and "i" types
         continue
     } elseif {$type eq "."} {
         break
     } else {
         puts "Unknown gopher type $type in $filename"
         continue
     }
  }
  return $retval
}

proc main {argv} {
   set ch [curl::init]

   set binary "error"
   set ignore [list]
   set images "save"
   set show_skipped false
   set url ""
   while {[llength $argv] > 0} {
       set remainder [lassign $argv arg]
       set argv $remainder
       if {$arg eq "--binary-skip"} {
           set binary "skip"
       } elseif {$arg eq "--binary-save"} {
           set binary "save"
       } elseif {$arg eq "--ignore"} {
           set remainder [lassign $argv prefix]
           set argv $remainder
           lappend ignore $prefix
       } elseif {$arg eq "--images-skip"} {
           set images "skip"
       } elseif {$arg eq "--images-save"} {
           set images "save"
       } elseif {$arg eq "--show-skipped"} {
           set show_skipped true
       } elseif {[regexp {^--} $arg]} {
           puts "\nUnknown option $arg\n"
           break
       } else {
           set url $arg
           break
       }
   }

   if {[string length $url] == 0} {
       puts {Usage: gopher-get.tcl [options] gopher-URI}
       puts ""
       puts "Options:"
       puts "--binary-skip (Skip item type 9 links)"
       puts "--binary-save (Save item type 9 links)"
       puts "--ignore <prefix> (Ignore links beginning with prefix"
       puts "--images-skip (Skip item type g and I links)"
       puts "--images-save (Save item type g and I links)"
       puts "--show-skipped (Report skipped links)"
       puts ""
       exit
   }

   set parts [gopher_uri_split $url]
   set parsed [dict create {*}$parts]
   set host [dict get $parsed fqdn]
   set path [dict get $parsed path]
   set path_len [string length $path]
   if {$path_len > 0} {
       incr path_len -1
   }

   set files_done [dict create]
   set uris [list $parsed]
   set urls_done [dict create]

   while {[llength $uris] > 0} {
       # for each url in the list, fetch a file
       # keep track of type 1 index files
       set files [list]
       foreach {uri} $uris {
           set type [dict get $uri type]
           set url [dict get $uri url]
           dict set urls_done $url 1
           set filename [fetch $ch $url]
           if {$type eq "1" &&
               [llength $filename] > 0 &&
               ![dict exists $files_done $filename]
           } {
               lappend files $filename
           }
       }

       # parse each type 1 index file for new urls
       set uris [list]
       foreach {filename} $files {
           dict set files_done $filename 1
           set links [parse $filename]
           foreach {link} $links {
               set ignored false
               foreach {prefix} $ignore {
                   set pattern $prefix
                   append pattern "*"
                   if {[string match $pattern $link]} {
                       if {$show_skipped} {
                           puts "Skipped link $link, ignore $prefix"
                       }
                       set ignored true
                       break
                   }
               }
               if {$ignored} {
                   continue
               }
               set parts [gopher_uri_split $link]
               set parsed [dict create {*}$parts]
               set link_host [dict get $parsed fqdn]
               set link_path [dict get $parsed path]
               set link_type [dict get $parsed type]
               set start [string range $link_path 0 $path_len]
               if {$link_host ne $host} {
                   if {$show_skipped} {
                       puts "Skipped link $link, $link_host != $host"
                   }
                   continue
               }
               if {$start ne $path} {
                   if {$show_skipped} {
                       puts "Skipped link $link, $start != $path"
                   }
                   continue
               }
               if {[dict exists $urls_done $link]} {
                   if {$show_skipped} {
                       puts "Skipped link $link, link already flagged done"
                   }
                   continue
               }
               if {$link_type eq "9"} {
                   if {$binary eq "error"} {
                       puts "Error: item type 9 (binary file)"
                       exit
                   } elseif {$binary eq "skip"} {
                       if {$show_skipped} {
                           puts "Skipped link $link, item type = 9"
                       }
                       continue
                   } else {
                       # save it
                   }
               } elseif {$link_type eq "g" || $link_type == "I"} {
                   if {$images eq "skip"} {
                       if {$show_skipped} {
                           puts "Skipped image $link, item type = $link_type"
                       }
                       continue
                   } else {
                       # save it
                   }
               }
               lappend uris $parsed
           }
       }

   }

   curl_exit $ch 0
}

main $::argv