#![allow(clippy::trivial_regex)]
use regex::bytes::Regex;
use regex::bytes::RegexSet;
use std::fs;
use std::fs::File;
use std::io::prelude::*;
use std::path::Path;
use std::os::unix::fs::FileTypeExt;
use unicode_bom::Bom;
use anyhow::Result;
//
// File signature links
// - https://asecuritysite.com/forensics/magic
// - https://filesignatures.net/
// - https://github.com/7h3rAm/cigma/blob/master/cigma/magicbytes.json
/// Line-terminator convention found in a scanned buffer.
///
/// The type is a small plain value, so it is `Clone` and `Copy` in addition to
/// being comparable and printable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LineEnding {
    /// Only bare line feeds (`\n`) were seen.
    Lf,
    /// Only bare carriage returns (`\r`) were seen.
    Cr,
    /// Only carriage-return/line-feed pairs (`\r\n`) were seen.
    Crlf,
    /// Any other combination, including no line terminator at all.
    ///
    /// The fields are the number of bare CRs, bare LFs and CRLF pairs, in
    /// that order.
    Mixed(usize, usize, usize),
}
/// Coarse classification of a file system entry as produced by
/// `Filetype::analyze`.
#[derive(Debug, PartialEq, Eq)]
pub enum Mimetype {
/// Executable formats recognised by `is_binary`, and BinHex 4.0 files.
Binary,
/// Text starting with `#! `, `#!/` or `<?php`, with its line-ending style.
Script(LineEnding),
/// File starting with the `%PDF` signature.
Pdf,
/// Compressed or archive formats recognised by `is_archive`, and ZOO archives.
Archive,
/// Plain ZIP archive (ZIP files whose name ends in `.cdy` are reported as `Data`).
Zip,
/// Anything that did not match another category, with its line-ending style.
Text(LineEnding),
/// PostScript, RTF, ZIP-based containers singled out by `is_archive`, and
/// binary content without a more specific signature.
Data,
/// Reported when the binary analysis yields no result at all.
Unknown,
/// Block device (from the entry's file type).
BlockDevice,
/// Character device (from the entry's file type).
CharDevice,
/// Directory.
Directory,
/// Symbolic link; the link itself is classified, it is not followed.
Symlink,
/// Named pipe.
Fifo,
/// Unix domain socket.
Socket,
/// Regular file of length 0.
Zerofile,
/// Regular file of length 1.
VeryShort,
/// File starting with a Unicode byte-order mark; carries the detected BOM.
Bom(Bom),
}
/// File classifier that owns the scratch buffer file contents are read into.
pub struct Filetype {
// Allocated once in `new()` (1 MiB, zero-filled) and overwritten by each
// `analyze()` call instead of being reallocated per file.
buffer: Vec<u8>,
}
/// Reports whether any of the first `len` bytes of `vec` lies in the range
/// `0x00..=0x08`; such low control bytes are taken as a sign of binary content.
///
/// At most `vec.len()` bytes are inspected, so an oversized `len` is harmless.
fn is_binary_data(vec: &[u8], len: usize) -> bool {
    vec.iter().take(len).any(|&byte| byte <= 8)
}
/// Count-based CRLF heuristic over the first `len` bytes of `buffer`.
///
/// CR and LF bytes are tallied independently of their position. The result
/// is `true` when there is at least one CR and either
/// - the CR and LF counts are equal,
/// - there is no LF at all, or
/// - there are more than 500 CRs and the two counts differ by less than 3
///   (a few stray lines are tolerated).
///
/// The leading underscore suggests the function is kept for reference only;
/// nothing in the visible part of this file calls it.
fn _is_crlf(buffer: &[u8], len: usize) -> bool {
    const CR: u8 = 0x0d; // 13
    const LF: u8 = 0x0a; // 10

    let (cr, lf) = buffer
        .iter()
        .take(len)
        .fold((0i32, 0i32), |(cr, lf), &byte| match byte {
            LF => (cr, lf + 1),
            CR => (cr + 1, lf),
            _ => (cr, lf),
        });
    let diff = cr - lf;

    // Without any CR the data cannot be CRLF-terminated.
    cr != 0 && (diff == 0 || lf == 0 || (cr > 500 && diff.abs() < 3))
}
/// Classifies the line terminators found in the first `len` bytes of `buffer`.
///
/// Every LF is counted either as part of a CRLF pair (when the preceding byte
/// was a CR) or as a bare LF. A CR is counted as bare once the byte after it
/// turns out not to be an LF, so a CR that is the very last scanned byte is
/// not counted at all.
///
/// Returns `Crlf`, `Cr` or `Lf` when exactly one kind of terminator occurred,
/// and `Mixed(bare_cr, bare_lf, crlf_pairs)` otherwise, which includes the
/// case of no terminator at all.
fn is_crlf(buffer: &[u8], len: usize) -> LineEnding {
    const CR: u8 = b'\r'; // 0x0D
    const LF: u8 = b'\n'; // 0x0A

    let mut lone_cr: usize = 0;
    let mut lone_lf: usize = 0;
    let mut pairs: usize = 0;
    let mut prev_was_cr = false;

    for &byte in buffer.iter().take(len) {
        match (byte, prev_was_cr) {
            (LF, true) => pairs += 1,
            (LF, false) => lone_lf += 1,
            (_, true) => lone_cr += 1,
            (_, false) => {}
        }
        prev_was_cr = byte == CR;
    }

    // True when `count` is the only non-zero tally.
    let sole = |count: usize, other_a: usize, other_b: usize| {
        count > 0 && other_a == 0 && other_b == 0
    };

    if sole(pairs, lone_cr, lone_lf) {
        LineEnding::Crlf
    } else if sole(lone_cr, lone_lf, pairs) {
        LineEnding::Cr
    } else if sole(lone_lf, lone_cr, pairs) {
        LineEnding::Lf
    } else {
        LineEnding::Mixed(lone_cr, lone_lf, pairs)
    }
}
impl Filetype {
    /// Number of leading bytes that are read first; every signature check
    /// works on this header only.
    const HEADER_LEN: usize = 262;

    /// Creates a classifier with a zero-filled 1 MiB scratch buffer that is
    /// reused by every call to [`Filetype::analyze`].
    pub fn new() -> Self {
        Filetype {
            buffer: vec![0; 1024 * 1024],
        }
    }

    /// Classifies the file system entry named `fname`.
    ///
    /// The steps are, in order:
    /// 1. Non-regular entries (symlink, directory, device, fifo, socket) are
    ///    reported from their file type without opening them.
    /// 2. Files of length 0 and 1 are reported as `Zerofile` / `VeryShort`.
    /// 3. The first [`Self::HEADER_LEN`] bytes are matched against known
    ///    signatures.
    /// 4. Otherwise as much of the file as fits into the buffer is scanned
    ///    for its line-ending style and reported as `Script` or `Text`.
    ///
    /// # Errors
    ///
    /// Fails when the metadata cannot be read, or when the file cannot be
    /// opened or its header cannot be read.
    pub fn analyze(&mut self, fname: &str) -> Result<Mimetype> {
        let path = Path::new(fname);
        if let Some(ft) = get_filetype(path) {
            return Ok(ft);
        }

        // Keep the length as `u64`: narrowing it to `usize` first could
        // truncate on 32-bit targets and misreport a huge file as empty.
        let file_length = fs::symlink_metadata(path)?.len();
        match file_length {
            0 => return Ok(Mimetype::Zerofile),
            1 => return Ok(Mimetype::VeryShort),
            _ => {}
        }

        let mut hdl_in = File::open(path)?;
        let mut bytes_read = hdl_in.read(&mut self.buffer[..Self::HEADER_LEN])?;

        // Only the bytes just read belong to this file. The rest of the
        // buffer may still hold data from a previously analysed file and must
        // not take part in any signature check.
        if let Some(mt) = Self::classify_header(&self.buffer[..bytes_read], fname) {
            return Ok(mt);
        }

        if (bytes_read as u64) < file_length {
            // Continue directly behind the header so the scanned region stays
            // contiguous even if the first read came back short. A failure
            // here is deliberately ignored: the header alone is still enough
            // for a best-effort text classification.
            if let Ok(rb) = hdl_in.read(&mut self.buffer[bytes_read..]) {
                bytes_read += rb;
            }
        }

        let text = &self.buffer[..bytes_read];
        let crlf = is_crlf(text, bytes_read);

        // Scripts are recognised by
        // - a shebang, which starts with either `#! ` or `#!/`
        // - the php indicator `<?php`
        let is_script = bytes_read >= 5
            && (text.starts_with(b"#! ")
                || text.starts_with(b"#!/")
                || text.starts_with(b"<?php"));
        Ok(if is_script {
            Mimetype::Script(crlf)
        } else {
            Mimetype::Text(crlf)
        })
    }

    /// Matches the header bytes of a file against the known signatures.
    ///
    /// `head` must contain only bytes of the file being analysed. Returns
    /// `None` when no signature matched and the file should be treated as
    /// text; `fname` is used solely for the `.cdy` special case.
    fn classify_header(head: &[u8], fname: &str) -> Option<Mimetype> {
        // Compiled once on first use instead of on every call.
        static BOUNDING_BOX_RE: std::sync::OnceLock<Regex> = std::sync::OnceLock::new();

        // PostScript signatures
        // - %!PS-Adobe-1.0, %!PS-Adobe-2.0, %!PS-Adobe-3.0, %!PS-Adobe-3.1
        if head.starts_with(b"%!PS") {
            return Some(Mimetype::Data);
        }
        // - %!\r\n%%BoundingBox: (or with a plain line feed)
        let bounding_box = BOUNDING_BOX_RE.get_or_init(|| {
            Regex::new(r"^(?-u)%!(\x0d\x0a|\x0A)%%BoundingBox")
                .expect("hard-coded PostScript pattern is a valid regex")
        });
        if head.len() >= 20 && bounding_box.is_match(head) {
            return Some(Mimetype::Data);
        }

        if head.starts_with(b"%PDF") {
            return Some(Mimetype::Pdf);
        }

        // rtf document, i.e. `{\rtf1`
        if head.starts_with(b"\x7B\x5C\x72\x74\x66\x31") {
            return Some(Mimetype::Data);
        }

        // ZOO archive, see http://fileformats.archiveteam.org/wiki/ZOO
        if head.len() >= 60 && &head[20..24] == b"\xDC\xA7\xC4\xFD" {
            return Some(Mimetype::Archive);
        }

        let bom: Bom = Bom::from(head);
        if bom != Bom::Null {
            return Some(Mimetype::Bom(bom));
        }

        if is_binary_data(head, head.len()) {
            return Some(match analyze_binary(head) {
                // ZIP containers named `*.cdy` are reported as data.
                Some(Mimetype::Zip) if fname.ends_with(".cdy") => Mimetype::Data,
                Some(mt) => mt,
                None => Mimetype::Unknown,
            });
        }

        // BinHex, see https://en.wikipedia.org/wiki/BinHex
        if head.len() >= 200
            && head.starts_with(b"(This file must be converted with BinHex 4.0)")
        {
            return Some(Mimetype::Binary);
        }

        None
    }
}
// https://en.wikipedia.org/wiki/Executable_and_Linkable_Format
// https://en.wikipedia.org/wiki/Mach-O
/// Returns `Some(Mimetype::Binary)` when `vec` starts with the signature of a
/// known executable format, and `None` otherwise.
///
/// All patterns are anchored at the start of `vec` and match raw bytes
/// (`(?-u)` switches Unicode mode off).
fn is_binary(vec: &[u8]) -> Option<Mimetype> {
    // Compiled once on first use; rebuilding the set for every analysed file
    // costs far more than the match itself.
    static BINARY_RE: std::sync::OnceLock<RegexSet> = std::sync::OnceLock::new();

    let binary_re = BINARY_RE.get_or_init(|| {
        RegexSet::new([
            r"^(?-u)\x7FELF[\x01\x02][\x01\x02]\x01[\x00-\x11]", // Executable and Linkable Format (ELF)
            r"^(?-u)\x00\x00\x03\xF3", // AmigaOS loadseg()ble executable/binary
            r"^(?-u)MZ", // DOS MZ executable file format and its descendants (including NE and PE)
            // Dalvik's executable, i.e. "dex\n035\0". The pattern must not
            // contain whitespace: a blank is a literal in regex syntax.
            r"^(?-u)\x64\x65\x78\x0A\x30\x33\x35\x00",
            r"^(?-u)#[!]",             // script executable
            r"^(?-u)\xCA\xFE\xBA\xBE", // Mach-O binary universal header
            // \xCE\xFA\xED\xFE or \xCF\xFA\xED\xFE
            r"^(?-u)(\xCE|\xCF)\xFA\xED\xFE", // Mach-O binary
            r"^(?-u)\x1B\x4C\x75\x61", // Lua bytecode
        ])
        .expect("hard-coded executable signatures are valid regexes")
    });

    if binary_re.is_match(vec) {
        Some(Mimetype::Binary)
    } else {
        None
    }
}
// https://github.com/7h3rAm/cigma/blob/master/cigma/magicbytes.json
// https://en.wikipedia.org/wiki/List_of_file_signatures
/// Classifies `vec` by archive and compression signatures.
///
/// Three groups of patterns are tried in this order:
/// 1. ZIP-based containers that are not regarded as archives -> `Data`
/// 2. compressed and archive formats other than ZIP -> `Archive`
/// 3. plain ZIP signatures -> `Zip`
///
/// Returns `None` when nothing matches. All patterns are anchored at the
/// start of `vec` and match raw bytes.
fn is_archive(vec: &[u8]) -> Option<Mimetype> {
    // Each set is compiled once on first use instead of on every call.
    static SPECIAL_ZIP_RE: std::sync::OnceLock<RegexSet> = std::sync::OnceLock::new();
    static ARCHIVE_RE: std::sync::OnceLock<RegexSet> = std::sync::OnceLock::new();
    static ZIP_RE: std::sync::OnceLock<RegexSet> = std::sync::OnceLock::new();

    // we first have to catch zip files with mimetype formats
    // - opendocument formats
    // - Word Open XML
    // Those we do not regard as archives
    let special_zip = SPECIAL_ZIP_RE.get_or_init(|| {
        RegexSet::new([
            r"^(?-u)PK\x03\x04.{20,}\x08\x00\x00\x00mimetypeapplication",
            r"^(?-u)PK\x03\x04\x14\x00\x06\x00", // Word Open XML (.docx)
            r"^(?-u)PK\x03\x04\x14\x00\x08\x00", // Java Jar file
            r"^(?-u)PK\x03\x04\x14\x00\x08\x08", // Java Jar file
            r"^(?-u)PK\x03\x04\x0A.*?META-INF",  // Java Jar file
            r"^(?-u)PK\x03\x04.*?META-INF",      // Java Jar file
            r"^(?-u)PK\x03\x04\x0A.*?\x56\x92\x48\x4F\xEF", // Java Jar file
        ])
        .expect("hard-coded zip container signatures are valid regexes")
    });
    if special_zip.is_match(vec) {
        return Some(Mimetype::Data);
    }

    let archive_re = ARCHIVE_RE.get_or_init(|| {
        RegexSet::new([
            r"^(?-u)\x37\x7A\xBC\xAF\x27\x1C", // 7zip
            r"^(?-u)\x1f\x8B",                 // gzip (.gz)
            r"^(?-u)\x1f\x9D",                 // LZW (.tar.Z)
            r"^(?-u)\x1f\xA0",                 // LZH (.tar.Z)
            r"^(?-u)\xFD\x37\x7A\x58\x5A\x00\x00", // XZ comp. utility using LZMA2 compression (.xz)
            r"^(?-u)\x4D\x53\x43\x46",         // Microsoft cabinet (.cab)
            r"^(?-u)\x42\x5A\x68",             // bzip2
            r"^(?-u)\x5A\x57\x53",             // lzma
            r"^(?-u)\x5D\x00\x00(\x01|\x02|\x04|\x08|\x10|\x20|\x40|\x80)\x00", // lzma
            r"^(?-u)\x5D\x00\x00\x00\x01",     // lzma
            r"^(?-u)(SIT!|SITD|STi0|StuffIt)", // SIT / stuffit (macintosh related)
            r"^(?-u)\x4D\x5A", // DOS MZ executable format, but found in zip archives
            r"^(?-u)\x52\x61\x72\x21\x1A\x07\x00", // RAR archive version 1.50 onwards
            r"^(?-u)\x52\x61\x72\x21\x1A\x07\x01\x00", // RAR archive version 5.0 onwards
            // https://en.wikipedia.org/wiki/LHA_(file_format)
            r"^(?-u)..-lh[0124567d]",          // LHarc (canonical LZH)
            r"^(?-u)..-lh[89abce]",            // LHarc (Joe Jared extensions)
            r"^(?-u)..-lhx",                   // LHarc (UNLHA32 extensions)
            r"^(?-u)..-(pc1|pm0|pm1|pm2|pms)", // LHarc (PMarc extensions)
            r"^(?-u)..-lz[s234578]",           // LHarc (LArc extensions)
            r"^(?-u)\x53\x5a\x44\x44\x88\xf0\x27\x33", // "SZDD": Microsoft COMPRESS.EXE format
        ])
        .expect("hard-coded archive signatures are valid regexes")
    });
    if archive_re.is_match(vec) {
        return Some(Mimetype::Archive);
    }

    let zip_re = ZIP_RE.get_or_init(|| {
        RegexSet::new([
            r"^(?-u)PK(\x03\x04|\x4c\x49\x54\x45|\x30\x30\x50|\x05\x06|\x07\x08)", // zip archive
        ])
        .expect("hard-coded zip signature is a valid regex")
    });
    if zip_re.is_match(vec) {
        return Some(Mimetype::Zip);
    }

    None
}
/// Classifies content that is already known to contain binary data.
///
/// Executable signatures are tried first, then archive signatures (only when
/// no executable matched); content matching neither is reported as
/// `Mimetype::Data`. As written, the result is therefore always `Some`.
fn analyze_binary(vec: &[u8]) -> Option<Mimetype> {
    is_binary(vec)
        .or_else(|| is_archive(vec))
        .or(Some(Mimetype::Data))
}
/// Maps a non-regular file system entry to its `Mimetype`.
///
/// The entry is inspected without following symbolic links. Returns `None`
/// for everything that is not a symlink, directory, block or character
/// device, fifo or socket, and also when the metadata cannot be read.
fn get_filetype(entry: &Path) -> Option<Mimetype> {
    let kind = entry.symlink_metadata().ok()?.file_type();

    let special = if kind.is_symlink() {
        Mimetype::Symlink
    } else if kind.is_dir() {
        Mimetype::Directory
    } else if kind.is_block_device() {
        Mimetype::BlockDevice
    } else if kind.is_char_device() {
        Mimetype::CharDevice
    } else if kind.is_fifo() {
        Mimetype::Fifo
    } else if kind.is_socket() {
        Mimetype::Socket
    } else {
        return None;
    };
    Some(special)
}
/// Runs `Filetype::analyze` over the fixtures in `tests_filemagic/` (plus
/// `/dev/null`) and checks the reported classification of each.
#[test]
fn test_filetype() {
    let mut ft = Filetype::new();
    // An analysis error shows up as `None`, which never equals the expectation.
    let mut check = |path: &str, expected: Mimetype| {
        assert_eq!(
            ft.analyze(path).ok(),
            Some(expected),
            "unexpected classification of {}",
            path
        );
    };

    check("tests_filemagic/zerofile", Mimetype::Zerofile);
    check("tests_filemagic/a_small_file", Mimetype::VeryShort);
    check("/dev/null", Mimetype::CharDevice);
    check("tests_filemagic/", Mimetype::Directory);
    check("tests_filemagic/zerofile_symlink", Mimetype::Symlink);
    check("tests_filemagic/some.pdf", Mimetype::Pdf);
    // This file is a pdf but has lines starting with % before the pdf signature shows up
    // The unix `file` command says: data
    // analyze() says TextCrlf
    //assert!(ft.analyze("tests_filemagic/musterlogo.pdf").ok() == Some(Mimetype::Script));
    check("tests_filemagic/x.pl", Mimetype::Script(LineEnding::Lf));
    check("tests_filemagic/main.php", Mimetype::Script(LineEnding::Lf));
    check("tests_filemagic/test.7z", Mimetype::Archive);
    check("tests_filemagic/x.tgz", Mimetype::Archive);
    check("tests_filemagic/test.pdf.xz", Mimetype::Archive);
    check("tests_filemagic/swebib.cab", Mimetype::Archive);
    check("tests_filemagic/test.tar.bz2", Mimetype::Archive);
    check("tests_filemagic/PIE.rar", Mimetype::Archive);
    check("tests_filemagic/infozip-os390.tar.Z", Mimetype::Archive);
    check("tests_filemagic/bla.lha", Mimetype::Archive);
    check("tests_filemagic/dvi.zoo", Mimetype::Archive);
    check("tests_filemagic/rsfs-oztex.sit", Mimetype::Archive);
    check("tests_filemagic/empty.zip", Mimetype::Zip);
    check(
        "tests_filemagic/README",
        Mimetype::Text(LineEnding::Mixed(0, 0, 0)),
    );
    // assert!(ft.analyze("tests_filemagic/README1").ok() == Some(Mimetype::Text));
    check("tests_filemagic/cp", Mimetype::Binary);
    check("tests_filemagic/cheq-f.sit-hqx", Mimetype::Binary);
    check("tests_filemagic/MuchMore", Mimetype::Binary);
    check("tests_filemagic/support.ps", Mimetype::Data);
    check("tests_filemagic/rosette.eps", Mimetype::Data);
    check("tests_filemagic/eutest.ps", Mimetype::Data);
    // assert!(ft.analyze("tests_filemagic/NORMAL.PS").ok() == Some(Mimetype::Data));
    check("tests_filemagic/chap5.rtf", Mimetype::Data);
    check("tests_filemagic/commons-math.jar", Mimetype::Data);
    check(
        "tests_filemagic/8stbu11h.htm",
        Mimetype::Text(LineEnding::Mixed(0, 1, 8710)),
    );
}