#![allow(clippy::trivial_regex)]

use regex::bytes::Regex;
use regex::bytes::RegexSet;

use std::fs;
use std::fs::File;
use std::io::prelude::*;
use std::path::Path;

use std::os::unix::fs::FileTypeExt;

use unicode_bom::Bom;

use anyhow::Result;

//
// File signatures links
// - https://asecuritysite.com/forensics/magic
// - https://filesignatures.net/
// - https://github.com/7h3rAm/cigma/blob/master/cigma/magicbytes.json

/// Line-terminator style found in a scanned byte buffer (see `is_crlf`).
#[derive(Debug, PartialEq, Eq)]
pub enum LineEnding {
   /// Only bare `\n` (LF) terminators were seen.
   Lf,
   /// Only bare `\r` (CR) terminators were seen.
   Cr,
   /// Only `\r\n` (CRLF) pairs were seen.
   Crlf,
   /// Every other outcome, including "no terminator at all" (`Mixed(0, 0, 0)`).
   /// The fields are the counts of bare CR, bare LF and CRLF pairs, in that order.
   Mixed(usize, usize, usize),
}

/// Coarse classification returned by `Filetype::analyze`.
#[derive(Debug, PartialEq, Eq)]
pub enum Mimetype {
   /// Executable-like content (signatures listed in `is_binary`) or a BinHex 4.0 file.
   Binary,
   /// Text starting with `#! `, `#!/` or `<?php`, together with its line endings.
   Script(LineEnding),
   /// Content starting with `%PDF`.
   Pdf,
   /// Compressed/archive formats listed in `is_archive`, and ZOO archives.
   Archive,
   /// Plain zip container (a `PK` signature that is not one of the special zip-based formats).
   Zip,
   /// Fallback for content without control bytes, together with its line endings.
   Text(LineEnding),
   /// PostScript, RTF, zip-based documents/jars, and binary content with no known signature.
   Data,
   /// Returned by `analyze` when `analyze_binary` yields no classification.
   Unknown,
   /// Block device node.
   BlockDevice,
   /// Character device node.
   CharDevice,
   /// Directory.
   Directory,
   /// Symbolic link (the link itself is never followed).
   Symlink,
   /// Named pipe.
   Fifo,
   /// Unix domain socket.
   Socket,
   /// Regular file of length zero.
   Zerofile,
   /// Regular file that is exactly one byte long.
   VeryShort,
   /// Content starting with a Unicode byte order mark; carries the detected BOM.
   Bom(Bom),
}

/// File classifier; create it with `Filetype::new` and call `analyze` per file.
pub struct Filetype {
   // Scratch space that `analyze` reads file contents into.
   // `new` allocates it once (1 MiB) and it is reused for every analyzed file.
   buffer: Vec<u8>,
}

/// Reports whether any of the first `len` bytes of `vec` is a low control
/// byte (`0x00..=0x08`), which is taken as evidence of non-text content.
///
/// A `len` larger than `vec.len()` simply scans the whole slice.
fn is_binary_data(vec: &[u8], len: usize) -> bool {
    vec.iter().take(len).any(|&byte| byte <= 8)
}

/// Count-based heuristic: does the first `len` bytes of `buffer` look like
/// CRLF-terminated text?
///
/// CR and LF bytes are counted independently (adjacency is not checked).
/// The answer is `true` when
/// - there is at least one CR and no LF at all, or
/// - the CR and LF counts are equal (and non-zero), or
/// - there are more than 500 CRs and the two counts differ by less than 3
///   (a few stray line endings are tolerated in large inputs).
fn _is_crlf(buffer: &[u8], len: usize) -> bool {
    const CR: u8 = 0x0d; // 13
    const LF: u8 = 0x0a; // 10

    let (n_cr, n_lf) = buffer
        .iter()
        .take(len)
        .fold((0i32, 0i32), |(cr, lf), &byte| match byte {
            CR => (cr + 1, lf),
            LF => (cr, lf + 1),
            _ => (cr, lf),
        });

    // Without a single CR this cannot be CRLF text.
    if n_cr == 0 {
        return false;
    }

    // CRs only, or a perfectly balanced CR/LF count.
    if n_lf == 0 || n_cr == n_lf {
        return true;
    }

    // Heuristic: accept if only a few lines are not CRLF.
    n_cr > 500 && (n_cr - n_lf).abs() < 3
}

/// Determines the line-ending style of the first `len` bytes of `buffer`.
///
/// Each LF is counted either as part of a CRLF pair (when the preceding byte
/// was a CR) or as a bare LF. A CR is counted as a bare CR once the byte
/// after it turns out not to be an LF; a CR that is the very last scanned
/// byte is therefore not counted at all.
///
/// Returns `Crlf`, `Cr` or `Lf` when exactly one kind of terminator was
/// seen, and `Mixed(bare_cr, bare_lf, crlf)` otherwise (including the case
/// where no terminator was seen).
fn is_crlf(buffer: &[u8], len: usize) -> LineEnding {
    const CR: u8 = 0x0d; // CR 0x0D 13 \r
    const LF: u8 = 0x0a; // LF 0x0A 10 \n

    let mut bare_cr = 0;
    let mut bare_lf = 0;
    let mut pairs = 0;
    let mut prev_was_cr = false;

    for &byte in buffer.iter().take(len) {
        match (prev_was_cr, byte) {
            (true, LF) => pairs += 1,
            (false, LF) => bare_lf += 1,
            // The previous CR was not followed by LF: it stands alone.
            (true, _) => bare_cr += 1,
            (false, _) => {}
        }
        prev_was_cr = byte == CR;
    }

    // A pure style requires exactly one of the three counters to be non-zero.
    let kinds_seen = [bare_cr, bare_lf, pairs]
        .iter()
        .filter(|&&count| count > 0)
        .count();

    if kinds_seen != 1 {
        return LineEnding::Mixed(bare_cr, bare_lf, pairs);
    }

    if pairs > 0 {
        LineEnding::Crlf
    } else if bare_cr > 0 {
        LineEnding::Cr
    } else {
        LineEnding::Lf
    }
}

impl Filetype {
    /// Creates a classifier with a zero-filled 1 MiB scratch buffer.
    ///
    /// The buffer is reused by every call to [`Filetype::analyze`], so a
    /// single instance can classify many files without reallocating.
    pub fn new() -> Self {
        Filetype {
            buffer: vec![0; 1024 * 1024],
        }
    }

    /// Classifies the file system entry named `fname`.
    ///
    /// Non-regular entries (symlink, directory, device, fifo, socket) are
    /// reported from their metadata alone. For regular files the first 262
    /// bytes are read and matched against known signatures. Only if the
    /// content looks like text is more of the file (up to the buffer size)
    /// read, to determine the line-ending style.
    ///
    /// Every check looks only at bytes read from *this* file; leftovers in
    /// the shared buffer from an earlier call are never inspected.
    ///
    /// # Errors
    ///
    /// Returns an error if the metadata cannot be read, the file cannot be
    /// opened, or the initial read fails. A failure while reading the rest
    /// of a text file is tolerated: classification then uses what was read.
    pub fn analyze(&mut self, fname: &str) -> Result<Mimetype> {
        // Size of the initial read; all signature checks use this prefix.
        const HEADER_LEN: usize = 262;

        let path = Path::new(fname);

        if let Some(ft) = get_filetype(path) {
            return Ok(ft);
        }

        let metadata = fs::symlink_metadata(fname)?;
        let file_length = metadata.len();

        match file_length {
            0 => return Ok(Mimetype::Zerofile),
            1 => return Ok(Mimetype::VeryShort),
            _ => {}
        }

        let mut hdl_in = File::open(path)?;

        let mut bytes_read: usize = hdl_in.read(&mut self.buffer[..HEADER_LEN])?;

        // Only the bytes that were actually read belong to this file.
        let head = &self.buffer[..bytes_read];

        // PostScript signatures
        // - %!PS-Adobe-1.0, %!PS-Adobe-2.0, %!PS-Adobe-3.0, %!PS-Adobe-3.1
        if head.starts_with(b"%!PS") {
            return Ok(Mimetype::Data);
        }

        // - %! and a line feed, e.g. %!\r\n%%BoundingBox:
        // The pattern is only compiled for candidates that start with "%!".
        if bytes_read >= 20 && head.starts_with(b"%!") {
            let re: Regex = Regex::new(r"^(?-u)%!(\x0d\x0a|\x0A)%%BoundingBox")
                .expect("hard-coded pattern is valid");
            if re.is_match(head) {
                return Ok(Mimetype::Data);
            }
        }

        if head.starts_with(b"%PDF") {
            return Ok(Mimetype::Pdf);
        }

        // rtf document ("{\rtf1")
        if head.starts_with(b"\x7B\x5C\x72\x74\x66\x31") {
            return Ok(Mimetype::Data);
        }

        // ZOO archive  http://fileformats.archiveteam.org/wiki/ZOO
        if bytes_read >= 60 && &head[20..24] == b"\xDC\xA7\xC4\xFD" {
            return Ok(Mimetype::Archive);
        }

        let bom: Bom = Bom::from(head);

        if bom != Bom::Null {
            return Ok(Mimetype::Bom(bom));
        }

        if is_binary_data(head, bytes_read) {
            return Ok(match analyze_binary(head) {
                // Zip containers with a `.cdy` extension are reported as data.
                Some(Mimetype::Zip) if fname.ends_with(".cdy") => Mimetype::Data,
                Some(mt) => mt,
                None => Mimetype::Unknown,
            });
        }

        // https://en.wikipedia.org/wiki/BinHex
        if bytes_read >= 200
            && head.starts_with(b"(This file must be converted with BinHex 4.0)")
        {
            return Ok(Mimetype::Binary);
        }

        // Text-like content: pull in more of the file, continuing exactly
        // where the first read stopped, until EOF or the buffer is full.
        if (bytes_read as u64) < file_length {
            while bytes_read < self.buffer.len() {
                match hdl_in.read(&mut self.buffer[bytes_read..]) {
                    Ok(0) => break,
                    Ok(n) => bytes_read += n,
                    Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
                    // Best effort: classify with what has been read so far.
                    Err(_) => break,
                }
            }
        }

        let text = &self.buffer[..bytes_read];
        let crlf = is_crlf(text, bytes_read);

        // checks for
        // - shebang which either starts with `#! ` or `#!/`
        // - php indicator
        let looks_like_script = bytes_read >= 5
            && (text.starts_with(b"#! ")
                || text.starts_with(b"#!/")
                || text.starts_with(b"<?php"));

        if looks_like_script {
            Ok(Mimetype::Script(crlf))
        } else {
            Ok(Mimetype::Text(crlf))
        }
    }
}

// https://en.wikipedia.org/wiki/Executable_and_Linkable_Format
// https://en.wikipedia.org/wiki/Mach-O
/// Returns `Some(Mimetype::Binary)` when `vec` starts with one of the known
/// executable signatures listed below, and `None` otherwise.
///
/// All patterns are anchored at the start of `vec` and match raw bytes
/// (`(?-u)`). Whitespace inside a pattern is literal, so the hex escapes
/// must be written without separating spaces.
fn is_binary(vec: &[u8]) -> Option<Mimetype> {
    let binary_re: RegexSet = RegexSet::new([
        r"^(?-u)\x7FELF[\x01\x02][\x01\x02]\x01[\x00-\x11]", // Executable and Linkable Format (ELF)
        r"^(?-u)\x00\x00\x03\xF3", // AmigaOS loadseg()ble executable/binary
        r"^(?-u)MZ", // DOS MZ executable file format and its descendants (including NE and PE)
        // Dalvik's executable: "dex\n", three version digits (e.g. "035"), NUL
        r"^(?-u)\x64\x65\x78\x0A[0-9]{3}\x00",
        r"^(?-u)#[!]", // script executable
        r"^(?-u)\xCA\xFE\xBA\xBE", // Mach-O binary universal header
        // \xCE\xFA\xED\xFE or \xCF\xFA\xED\xFE
        r"^(?-u)(\xCE|\xCF)\xFA\xED\xFE", // Mach-O binary
        r"^(?-u)\x1B\x4C\x75\x61",        // Lua bytecode
    ])
    .expect("hard-coded patterns are valid");

    if binary_re.is_match(vec) {
        Some(Mimetype::Binary)
    } else {
        None
    }
}

// https://github.com/7h3rAm/cigma/blob/master/cigma/magicbytes.json
// https://en.wikipedia.org/wiki/List_of_file_signatures
/// Classifies `vec` by archive/compression signature.
///
/// Checked in this order:
/// 1. zip-based document and jar formats, reported as `Mimetype::Data`
///    because they are not regarded as archives;
/// 2. the archive/compression signatures below, reported as `Mimetype::Archive`;
/// 3. any remaining `PK` zip signature, reported as `Mimetype::Zip`.
///
/// Returns `None` when nothing matches. All patterns are anchored at the
/// start of `vec` and match raw bytes (`(?-u)`).
fn is_archive(vec: &[u8]) -> Option<Mimetype> {
    // we first have to catch zip files with mimetype formats
    //  - opendocument formats
    //  - Word Open XML
    // Those we do not regard as archives
    let special_zip: RegexSet = RegexSet::new([
        r"^(?-u)PK\x03\x04.{20,}\x08\x00\x00\x00mimetypeapplication",
        r"^(?-u)PK\x03\x04\x14\x00\x06\x00", // Word Open XML (.docx)
        r"^(?-u)PK\x03\x04\x14\x00\x08\x00", // Java Jar file
        r"^(?-u)PK\x03\x04\x14\x00\x08\x08", // Java Jar file
        r"^(?-u)PK\x03\x04\x0A.*?META-INF",  // Java Jar file
        r"^(?-u)PK\x03\x04.*?META-INF",      // Java Jar file
        r"^(?-u)PK\x03\x04\x0A.*?\x56\x92\x48\x4F\xEF", // Java Jar file
    ])
    .expect("hard-coded patterns are valid");

    if special_zip.is_match(vec) {
        return Some(Mimetype::Data);
    }

    let archive_re: RegexSet = RegexSet::new([
        r"^(?-u)\x37\x7A\xBC\xAF\x27\x1C",     // 7zip
        r"^(?-u)\x1f\x8B",                     // gzip (.gz)
        r"^(?-u)\x1f\x9D",                     // LZW (.tar.Z)
        r"^(?-u)\x1f\xA0",                     // LZH (.tar.Z)
        r"^(?-u)\xFD\x37\x7A\x58\x5A\x00\x00", // XZ comp. utility using LZMA2 compression (.xz)
        r"^(?-u)\x4D\x53\x43\x46",             // Microsoft cabinet (.cab)
        r"^(?-u)\x42\x5A\x68",                 // bzip2
        r"^(?-u)\x5A\x57\x53",                 // lzma
        r"^(?-u)\x5D\x00\x00(\x01|\x02|\x04|\x08|\x10|\x20|\x40|\x80)\x00", // lzma
        r"^(?-u)\x5D\x00\x00\x00\x01",         // lzma
        r"^(?-u)(SIT!|SITD|STi0|StuffIt)",     // SIT / stuffit (macintosh related)
        r"^(?-u)\x4D\x5A", // DOS MZ executable format, but found in zip archives
        r"^(?-u)\x52\x61\x72\x21\x1A\x07\x00", // RAR archive version 1.50 onwards
        r"^(?-u)\x52\x61\x72\x21\x1A\x07\x01\x00", // RAR archive version 5.0 onwards
        // https://en.wikipedia.org/wiki/LHA_(file_format)
        // The two bytes before the method id are arbitrary binary header
        // bytes. The `s` flag lets `.` match every byte, including 0x0A,
        // which a plain `.` would reject.
        r"^(?s-u)..-lh[0124567d]",                 // LHarc (canonical LZH)
        r"^(?s-u)..-lh[89abce]",                   // LHarc (Joe Jared extensions)
        r"^(?s-u)..-lhx",                          // LHarc (UNLHA32 extensions)
        r"^(?s-u)..-(pc1|pm0|pm1|pm2|pms)",        // LHarc (PMarc extensions)
        r"^(?s-u)..-lz[s234578]",                  // LHarc (LArc extensions)
        r"^(?-u)\x53\x5a\x44\x44\x88\xf0\x27\x33", // "SZDD" (Microsoft COMPRESS.EXE format)
    ])
    .expect("hard-coded patterns are valid");

    if archive_re.is_match(vec) {
        return Some(Mimetype::Archive);
    }

    let zip_re: RegexSet = RegexSet::new([
        r"^(?-u)PK(\x03\x04|\x4c\x49\x54\x45|\x30\x30\x50|\x05\x06|\x07\x08)", // zip archive
    ])
    .expect("hard-coded pattern is valid");

    if zip_re.is_match(vec) {
        return Some(Mimetype::Zip);
    }

    None
}

/// Classifies content that is already known to contain binary data.
///
/// The executable signatures (`is_binary`) are tried first, then the
/// archive signatures (`is_archive`). If neither recognises the content the
/// result is `Some(Mimetype::Data)`, so this function never returns `None`.
fn analyze_binary(vec: &[u8]) -> Option<Mimetype> {
    is_binary(vec)
        .or_else(|| is_archive(vec))
        .or(Some(Mimetype::Data))
}

/// Maps a non-regular file system entry to its `Mimetype`.
///
/// The entry's own metadata is queried without following symlinks, so a
/// symlink is reported as `Symlink` regardless of its target. Returns `None`
/// for anything that is not a symlink, directory, block/char device, fifo or
/// socket, and also when the metadata cannot be read.
fn get_filetype(entry: &Path) -> Option<Mimetype> {
    let kind = entry.symlink_metadata().ok()?.file_type();

    let special = if kind.is_symlink() {
        Mimetype::Symlink
    } else if kind.is_dir() {
        Mimetype::Directory
    } else if kind.is_block_device() {
        Mimetype::BlockDevice
    } else if kind.is_char_device() {
        Mimetype::CharDevice
    } else if kind.is_fifo() {
        Mimetype::Fifo
    } else if kind.is_socket() {
        Mimetype::Socket
    } else {
        return None;
    };

    Some(special)
}

/// Runs `Filetype::analyze` over the fixtures in `tests_filemagic/` (plus
/// `/dev/null`) and checks each classification.
///
/// One `Filetype` instance is shared by all cases, in the order listed, so
/// the test also exercises reuse of the internal buffer. On failure the
/// message names the offending path and shows both values.
#[test]
fn test_filetype() {
    let mut ft = Filetype::new();

    let cases = [
        ("tests_filemagic/zerofile", Mimetype::Zerofile),
        ("tests_filemagic/a_small_file", Mimetype::VeryShort),
        ("/dev/null", Mimetype::CharDevice),
        ("tests_filemagic/", Mimetype::Directory),
        ("tests_filemagic/zerofile_symlink", Mimetype::Symlink),
        ("tests_filemagic/some.pdf", Mimetype::Pdf),
        // Disabled: tests_filemagic/musterlogo.pdf
        // This file is a pdf but has lines starting with % before the pdf signature shows up.
        // The unix `file` command says: data
        // analyze() says TextCrlf
        ("tests_filemagic/x.pl", Mimetype::Script(LineEnding::Lf)),
        ("tests_filemagic/main.php", Mimetype::Script(LineEnding::Lf)),
        ("tests_filemagic/test.7z", Mimetype::Archive),
        ("tests_filemagic/x.tgz", Mimetype::Archive),
        ("tests_filemagic/test.pdf.xz", Mimetype::Archive),
        ("tests_filemagic/swebib.cab", Mimetype::Archive),
        ("tests_filemagic/test.tar.bz2", Mimetype::Archive),
        ("tests_filemagic/PIE.rar", Mimetype::Archive),
        ("tests_filemagic/infozip-os390.tar.Z", Mimetype::Archive),
        ("tests_filemagic/bla.lha", Mimetype::Archive),
        ("tests_filemagic/dvi.zoo", Mimetype::Archive),
        ("tests_filemagic/rsfs-oztex.sit", Mimetype::Archive),
        ("tests_filemagic/empty.zip", Mimetype::Zip),
        (
            "tests_filemagic/README",
            Mimetype::Text(LineEnding::Mixed(0, 0, 0)),
        ),
        // Disabled: tests_filemagic/README1 (expected some Mimetype::Text(..))
        ("tests_filemagic/cp", Mimetype::Binary),
        ("tests_filemagic/cheq-f.sit-hqx", Mimetype::Binary),
        ("tests_filemagic/MuchMore", Mimetype::Binary),
        ("tests_filemagic/support.ps", Mimetype::Data),
        ("tests_filemagic/rosette.eps", Mimetype::Data),
        ("tests_filemagic/eutest.ps", Mimetype::Data),
        // Disabled: tests_filemagic/NORMAL.PS (expected Mimetype::Data)
        ("tests_filemagic/chap5.rtf", Mimetype::Data),
        ("tests_filemagic/commons-math.jar", Mimetype::Data),
        (
            "tests_filemagic/8stbu11h.htm",
            Mimetype::Text(LineEnding::Mixed(0, 1, 8710)),
        ),
    ];

    for (path, expected) in cases {
        assert_eq!(
            ft.analyze(path).ok(),
            Some(expected),
            "unexpected classification for {}",
            path
        );
    }
}