#!/usr/bin/awk -f
# gopher-validate.awk Version 5
#
# Validate gopher directory or text
#
# Relevant RFCs:
# gopher://gopher.fnord.one/0/Mirrors/RFC/rfc1436.txt
# gopher://gopher.fnord.one/0/Mirrors/RFC/rfc8962.txt
#
# See also:
# https://tildegit.org/sloum/mapcheck/src/branch/master/mapcheck
# https://codemadness.org/git/gopher-validator/file/gopher-validator.c.html
#
# This script requires:
# - /bin/sh
# - curl
# - iconv
# - /usr/bin/awk
# - /usr/bin/mktemp

# mktemp()
#
# Create a temporary file by running /usr/bin/mktemp and return its path.
#
# The names after the gap in the parameter list are awk-style locals;
# callers pass no arguments.
#
# On failure (non-zero status from closing the pipe, or no path printed)
# an error is printed and the script exits with status 1.
function mktemp(    cmd, line, path, status) {
   cmd = "/usr/bin/mktemp"
   path = ""
   # Read into a local so $0 and NF are left untouched; if mktemp prints
   # several lines the last one wins.
   while ((cmd | getline line) > 0) {
       path = line
   }
   status = close(cmd)
   if (status != 0) {
       print "Error: mktemp failed exit status: " status
       # A bare "exit" would report success (status 0) to the caller.
       exit 1
   }
   if (length(path) == 0) {
       print "Error: mktemp failed, no tmpfile"
       exit 1
   }
   return path
}

# unlink(name)
#
# Delete the file `name` by running rm through the shell.
#
# The path is wrapped in single quotes (embedded single quotes become
# '\'' ) so whitespace or shell metacharacters in it are passed to rm
# literally, and "--" stops a leading "-" from being read as an option.
# The names after the gap in the parameter list are awk-style locals.
# Returns nothing; rm's exit status is ignored.
function unlink(name,    n, parts, k, quoted) {
   n = split(name, parts, "'")
   quoted = "'" parts[1]
   for (k = 2; k <= n; k++) {
       quoted = quoted "'\\''" parts[k]
   }
   quoted = quoted "'"
   system("rm -- " quoted)
   return
}

# validate_encoding(iconv, encoding, file)
#
# Check whether `file` is valid in character encoding `encoding` by
# asking iconv to convert it from that encoding to itself and discarding
# all output.
#
#   iconv     command used to run iconv (inserted into the shell command
#             as given)
#   encoding  encoding name passed to both -f and -t
#   file      path of the file to check; it is single-quoted for the
#             shell so whitespace or metacharacters in it are harmless
#
# Returns 1 when iconv exits with status 0, otherwise 0.
# The names after the gap in the parameter list are awk-style locals.
function validate_encoding(iconv, encoding, file,    n, parts, k, quoted, cmd) {
   # Shell-quote the path: wrap in single quotes, embedded ' becomes '\''
   n = split(file, parts, "'")
   quoted = "'" parts[1]
   for (k = 2; k <= n; k++) {
       quoted = quoted "'\\''" parts[k]
   }
   quoted = quoted "'"

   cmd = sprintf("%s -f %s -t %s <%s >/dev/null 2>/dev/null",
       iconv, encoding, encoding, quoted)
   if (system(cmd) == 0) {
       return 1
   }
   return 0
}

# validate_gophermap(file)
#
# Check the gopher directory listing stored in `file` line by line.
#
# Errors (stop at the first one, return 0):
#   - a line that does not end in CR
#   - any line after the terminating "." line
#   - a line without 4 tab-separated fields (a 5th field of "+" is
#     accepted as a gopher+ item)
#   - a first field longer than 71 characters (item type character plus
#     display string), unless the global warn_longlines is set, in which
#     case it is only a warning
#   - non-printable characters in the display string
#   - the file cannot be read
# Warnings (validation continues):
#   - an item type character that is not in the known list
#   - more than 20 consecutive info ("i") lines, reported once
#
# Returns 1 if no error was found, 0 otherwise.  Diagnostics go to
# standard output.
#
# Side effects: sets FS to a tab and overwrites $0/NF while reading.
# The names after the gap in the parameter list are awk-style locals.
function validate_gophermap(file,    types, known, n, k, ilines, iline_warning_seen, lineno, seen_end, retval, status, msg, Item_Type, User_Name) {
   # valid gopher item types
   n = split("0 1 2 3 4 5 6 7 8 9 I M P T c e g h i s v + ; ! .", known, " ")
   for (k = 1; k <= n; k++) {
       types[known[k]] = 0
   }

   FS = "\t"
   ilines = 0
   iline_warning_seen = 0
   lineno = 0
   seen_end = 0
   retval = 1
   # getline returns -1 on a read error; testing "> 0" keeps an
   # unreadable file from turning this into an endless loop.
   while ((status = (getline < file)) > 0) {
       lineno++
       if (/\r$/) {
           # strip the CR; assigning $0 re-splits the fields
           $0 = substr($0, 1, length($0) - 1)
       } else {
           printf "Error: Missing CR character on line %d\n\n", lineno
           print "RFC1436 Section 2:"
           print "    A CR LF denotes the end of the item."
           retval = 0
           break
       }
       if (seen_end) {
           print "Error: Unexpected trailing text after end of gophermap.\n"
           print "RFC1436 Introduction:"
           print "    The server responds with a block of text terminated"
           print "    by a period on a line by itself and closes the"
           print "    connection."
           retval = 0
           break
       }
       if (NF == 1 && $0 == ".") {
           seen_end = 1
           continue
       }
       if (NF == 4 || (NF == 5 && $5 == "+")) {
           # items with 4 fields are normal
           # gopher+ items are ignored
       } else {
           printf "Error: %d fields on line %d, expected 4\n\n", NF, lineno
           print "RFC1436 Introduction:"
           print "    Each item in a directory is identified by a"
           print "    [1] type... [and] user-visible name..."
           print "    [2] an opaque selector string..."
           print "    [3] a host name..."
           print "    [4] and an IP portnumber..."
           retval = 0
           break
       }
       # $1 is the type character followed by the display string
       if (length($1) > 71) {
           msg = "Error"
           if (warn_longlines) {
               msg = "Warning"
           }
           printf "%s: Long user display string on line %d\n\n", msg, lineno
           print "RFC1436 Section 3.9:"
           print "    ... user display string should be kept"
           print "    under 70 characters in length.\n"
           if (!warn_longlines) {
               retval = 0
               break
           }
       }
       Item_Type = substr($1, 1, 1)
       if (!(Item_Type in types)) {
           printf "Warning: Non-standard gophertype \"%s\" on line %d\n\n",
               Item_Type, lineno
           print "RFC1436 Section 3.8:"
           print "A list of defined item-type characters follows:"
           print "0, 1, 2, 3, 4, 5, 6, 7, 8, 9, +, T, g, I"
           print "UMN gopher object/GSgopherobj.h:"
           print "M, P, c, e, h, i, s, v, ;, !, .\n"
       }
       User_Name = substr($1, 2)
       if (length(User_Name) > 0 && User_Name !~ /^[[:print:]]+$/) {
           printf "Error: Non-printable characters on line %d\n\n", lineno
           print "RFC1436 Appendix:"
           print "    It is *highly* recommended that the User_Name field"
           print "    contain only printable characters, since many"
           print "    different clients will be using it."
           retval = 0
           break
       }
       # count the current run of consecutive info-lines
       if (/^i/) {
           ilines++
       } else {
           ilines = 0
       }
       if (ilines > 20 && !iline_warning_seen) {
           print "Warning: Over 20 consecutive info-lines"
           print "Gophermap may contain content rather than navigation."
           print "See:"
           print "gopher://gopher.icu/phlog/Computing/The-state-of-gopher.md"
           print ""
           iline_warning_seen = 1
       }
   }
   if (status < 0) {
       print "Error: Couldn't read gophermap file: " file
       retval = 0
   }
   close(file)
   return retval
}

# curlcfg_quote(s)
#
# Return `s` as a double-quoted curl config-file parameter.  Backslash
# and double quote are backslash-escaped and newline, CR and tab are
# written as \n, \r and \t, so the value always stays on one config line.
function curlcfg_quote(s,    out, k, c) {
   out = "\""
   for (k = 1; k <= length(s); k++) {
       c = substr(s, k, 1)
       if (c == "\\" || c == "\"") {
           out = out "\\" c
       } else if (c == "\n") {
           out = out "\\n"
       } else if (c == "\r") {
           out = out "\\r"
       } else if (c == "\t") {
           out = out "\\t"
       } else {
           out = out c
       }
   }
   return out "\""
}

# sh_quote_arg(s)
#
# Return `s` wrapped in single quotes for /bin/sh, with embedded single
# quotes rewritten as '\''.
function sh_quote_arg(s,    n, parts, k, out) {
   n = split(s, parts, "'")
   out = "'" parts[1]
   for (k = 2; k <= n; k++) {
       out = out "'\\''" parts[k]
   }
   return out "'"
}

# validate_text(curl, iconv, uri)
#
# Fetch `uri` with curl into a temporary file, check that the result is
# valid ASCII or UTF-8 using iconv and, when the global `type` is "1",
# validate it as a gophermap.
#
#   curl   command used to run curl
#   iconv  command used to run iconv
#   uri    gopher URI to fetch
#
# Never returns: prints "Valid" and exits 0 on success, or prints an
# error and exits 1.  Temporary files are removed before exiting.
# The names after the gap in the parameter list are awk-style locals.
function validate_text(curl, iconv, uri,    curlcfg, curlout, status, encodings, count, k, is_valid) {
   # use curl to fetch gopher directory or text
   curlcfg = mktemp()
   curlout = mktemp()
   print "--max-filesize 256K"                 > curlcfg
   print "--max-redirs 0"                     >> curlcfg
   print "--output " curlcfg_quote(curlout)   >> curlcfg
   print "--proto =gopher,gophers"            >> curlcfg
   print "--silent"                           >> curlcfg
   # Quoted so whitespace or a newline in the argument cannot smuggle
   # extra options into the config file.
   print "--url " curlcfg_quote(uri)          >> curlcfg
   # close (not just flush) so the config is complete on disk and no
   # open handle remains when the file is deleted
   close(curlcfg)
   status = system(curl " -K " sh_quote_arg(curlcfg))
   unlink(curlcfg)
   if (status != 0) {
       unlink(curlout)
       print "Error: Curl couldn't fetch URI"
       exit 1
   }

   # use iconv to validate the result character encoding
   #
   # All 8-bit characters represent valid ISO-8859-1 (Latin)
   # encoding so checking for invalid encoding is meaningless;
   # ISO-8859-1 is therefore not in the list.
   count = split("ASCII UTF-8", encodings, " ")
   is_valid = 0
   for (k = 1; k <= count; k++) {
       if (validate_encoding(iconv, encodings[k], curlout)) {
           is_valid = 1
           break
       }
   }
   if (!is_valid) {
       print "Error: Invalid character encoding."
       print "Expected ASCII or UTF-8.\n"
       print "RFC1436 Section 4(b):"
       print "    The well-tempered server ought to send \"text\"..."
       unlink(curlout)
       exit 1
   }

   # validate gophermap if it is a directory
   if (type == "1") {
       if (validate_gophermap(curlout) == 0) {
           unlink(curlout)
           exit 1
       }
   }

   print "Valid"
   unlink(curlout)
   exit 0
}

# validate_uri(uri)
#
# Check that `uri` is a gopher:// or gophers:// URI whose item type is
# 0 (text) or 1 (directory) and return that type as a one-character
# string.  The global `type` is set to the same value.
#
# The item type is the first character of the path.  Following RFC 4266,
# the selector does not have to start with "/", and a URI with an empty
# path ("gopher://host" or "gopher://host/") defaults to type "1".
#
# Prints an error and exits with status 1 if the scheme is wrong or the
# item type is anything other than 0 or 1.
function validate_uri(uri) {
   if (uri !~ /^gophers?:\/\//) {
       print "Error: expected gopher: protocol URI"
       exit 1
   }
   # The pattern is anchored at the start, so the last matched character
   # (position RLENGTH) is the item type.
   if (match(uri, /^gophers?:\/\/[^\/]*\/./)) {
       type = substr(uri, RLENGTH, 1)
   } else {
       # no path at all: RFC 4266 default
       type = "1"
   }
   if (type != "0" && type != "1") {
      print "Error: Expected item type 0 or 1 in URI"
      exit 1
   }
   return type
}

# Entry point: parse the command line, make sure the external tools are
# available, then fetch and validate the URI.  All work happens here; no
# input records are read.
BEGIN {
   # No arguments at all: show usage and fail.
   if (ARGC < 2) {
       print "Usage: gopher-validate.awk -- [options] URI"
       print ""
       print "Options:"
       print "--warn-longlines  Warn instead of error on long lines"
       print ""
       exit 1
   }

   # The first non-option argument is the URI; a second one is an error.
   uri = ""
   warn_longlines = 0
   for (argi = 1; argi < ARGC; argi++) {
       if (ARGV[argi] == "--warn-longlines") {
           warn_longlines = 1
           continue
       }
       if (length(uri) > 0) {
           print "Error: Unrecognized argument: " ARGV[argi]
           exit 1
       }
       uri = ARGV[argi]
   }

   # Exits with an error unless the URI is a gopher URI of type 0 or 1.
   type = validate_uri(uri)

   # Both helper programs must be on the PATH; check them in order.
   curl = "curl"
   iconv = "iconv"
   ntools = split(curl " " iconv, tools, " ")
   for (t = 1; t <= ntools; t++) {
       if (system("command -v " tools[t] " >/dev/null") != 0) {
           print "Error: Couldn't find command: " tools[t]
           exit 1
       }
   }

   # Does not return: exits 0 when valid, 1 otherwise.
   validate_text(curl, iconv, uri)
}