# Part of the A-A-P project: File type detection module

#! /usr/bin/env python
#
# Part of the A-A-P project: File type detection module

# Copyright (C) 2002 Stichting NLnet Labs
# Permission to copy and use this file is specified in the file COPYING.
# If this file is missing you can find it here: http://www.a-a-p.org/COPYING

# This module detects the type of a file.
# It can be run as a separate program or called from Python.
# Many types are recognized by default. More types can be added dynamically.
# See filetype.txt for an explanation.
#
#
# EXTERNAL INTERFACE:
#
# ft_detect(fname) Detects the type of file "fname".
#
# ft_check_dir(dir [, errmsg])
# Scan directory "dir" for "*.afd" files, which are
# loaded with ft_read_file().
#
# ft_read_file(fname) Read file "fname" for detection rules.
#
# ft_add_rules(str) Add file type detection rules from "str". See
# "filetype.txt" for the syntax.
#

import string
import os.path
import re

from Util import *

# Set to non-zero when run as a program.
# The "__main__" section at the end of this file sets it to 1.
# ft_check_dir() reads it to choose between printing an error message and
# passing it to msg_error() from the Message module.
_run_as_program = 0

#
# The default list of detected file types.
# Each non-empty line is one rule in the syntax parsed by ft_add_rules():
#   suffix {suffix} {type}           file name extension, without the dot
#   regexp {pattern} {type}          pattern matched against the file name
#   script {pattern} {type}          pattern matched against the "#!" line
# NOTE: since "append" isn't used, the order of checking is last one first!
# NOTE: this is not a raw string, thus "\\b" results in "\b" in the pattern.
#
_def_detect_list = """
suffix c c
suffix h cpp
suffix hh cpp
suffix H cpp
suffix hxx cpp
suffix hpp cpp
suffix cpp cpp
suffix cc cpp
suffix C cpp
suffix c++ cpp
suffix cxx cpp
suffix moc cpp
suffix tcc cpp
suffix inl cpp

suffix py python
suffix pl perl
suffix sh sh
suffix aap aap
suffix afd afd
suffix html html
suffix htm html

suffix Z ignore
suffix gz ignore
suffix bz2 ignore
suffix bak ignore

regexp .*enlightenment/.*\\.cfg$ c
regexp .*vimrc$ vim
regexp .*\\bconfigure$ sh

script .*\\bpython python
script .*\\bperl perl
script .*csh\\b csh
script .*\\bbash sh
"""

# List of _Ft_py objects: Python code executed to detect file type.
# Used first: ft_detect() runs these before looking at suffix, regexp and
# script rules.
_py_list_before = []

# Dictionary used to map file name extension (without the leading dot) to
# file type.  The type "ignore" makes ft_detect() strip the extension and
# continue with the rest of the name.
_suffix_dict = {}

# List of _Ft_re objects; a match of the RE with the file name defines the file
# type.  Checked in list order, the first match wins.
_regexp_list = []

# List of _Ft_re objects: a match of the RE with the script in the first line
# of the file defines the file type.  Only used when that line starts with
# "#!".
_script_list = []

# List of _Ft_py objects: Python code executed to detect file type.
# Used after everything else didn't detect the type.
_py_list_after = []

_did_init = 0 # non-zero when __init__() did its work

def __init__():
    """Set up the detection rules; only the first call does any work.

    Empties all rule containers, loads the built-in rules from
    _def_detect_list and then the "*.afd" files found in the system-wide
    and the per-user rule directory."""
    global _py_list_before, _py_list_after
    global _suffix_dict, _regexp_list, _script_list
    global _did_init

    if not _did_init:
        # Set the flag before loading anything: ft_add_rules() calls
        # __init__() itself and must find the work marked as done.
        _did_init = 1

        _py_list_before, _py_list_after = [], []
        _suffix_dict = {}
        _regexp_list, _script_list = [], []

        # The built-in detection rules come first.
        ft_add_rules(_def_detect_list)

        # Then the rules from system and user *.afd files.
        for afd_dir in ("/usr/local/share/aap/afd", "~/.aap/afd"):
            ft_check_dir(os.path.expanduser(afd_dir))

class DetectError(Exception):
    """Error for something gone wrong while reading, parsing or executing
    file type detection rules.

    "args" is the error message; str() of the exception returns it.  When
    omitted the exception has no message and str() returns ''."""
    def __init__(self, args = None):
        # Pass the message on to Exception instead of assigning it to
        # self.args.  On newer Python versions "args" is converted to a
        # tuple on assignment: a string would be split into characters
        # (making str() useless) and None would raise a TypeError.
        if args is None:
            Exception.__init__(self)
        else:
            Exception.__init__(self, args)

def _report_error(msg):
    """Report error message "msg": print it when run as a program, otherwise
    pass it to msg_error() of the Message module."""
    if _run_as_program:
        # print() with a single argument works as a statement and as a
        # function.
        print(msg)
    else:
        from Message import msg_error
        msg_error(msg)

def ft_check_dir(dir, errmsg = 0):
    """Check directory "dir" for *.afd files and load them.
       Each file is read with ft_read_file().  A DetectError from one file
       is reported and does not stop the other files from being loaded.
       When "errmsg" is non-zero give an error message when the directory
       doesn't exist."""
    import sys

    # isdir() is also false when "dir" does not exist at all.
    if os.path.isdir(dir):
        # NOTE(review): glob() is not imported in this file, presumably it
        # comes from "from Util import *" -- confirm.
        for f in glob(os.path.join(dir, "*.afd")):
            try:
                ft_read_file(f)
            except DetectError:
                # sys.exc_info() is used instead of "except DetectError, e"
                # because it is valid syntax in every Python version.
                _report_error(str(sys.exc_info()[1]))
    elif errmsg:
        _report_error(_('Directory does not exist: "%s"') % dir)

def ft_read_file(fname):
    """Read file "fname" for file type detection rules.
       The contents of the file are passed to ft_add_rules().
       Raises DetectError when the file cannot be opened or read, and lets
       through a DetectError raised by ft_add_rules()."""
    import sys

    # The local names must not be "str" or "file": a local "str" hides the
    # builtin for the whole function, which made str(e) in the error
    # handling fail with an UnboundLocalError.
    try:
        rulefile = open(fname)
    except IOError:
        raise DetectError((_('Cannot open "%s": ') % fname)
                                                + str(sys.exc_info()[1]))
    try:
        try:
            rules = rulefile.read()
        except IOError:
            raise DetectError((_('Cannot read "%s": ') % fname)
                                                + str(sys.exc_info()[1]))
    finally:
        # Also close the file when reading failed.
        rulefile.close()

    ft_add_rules(rules)

def ft_add_rules(str):
    """Add file type detection rules from string "str".

       "str" holds one rule per line.  Empty lines and lines that start
       with "#" are ignored.  The rules handled here are:
           suffix {suffix} {type}
           regexp {pattern} {type} [append]
           script {pattern} {type} [append]
           python [append] [after]
                {lines of Python code, indented more than "python"}
       The first argument may be in single or double quotes.  For "suffix",
       "regexp" and "script" a superfluous argument is silently ignored.

       Raises DetectError for a missing quote or argument, an illegal
       "python" argument, missing Python commands or an unknown item.

       NOTE: the argument name "str" hides the builtin str() inside this
       function; it is kept so that existing callers keep working."""
    # Always load the default rules first (skipped when done already).
    __init__()

    # Split the string into individual lines.
    lines = str.split('\n')

    # Loop over all the lines (may use more than one for python items).
    line_idx = 0
    line_count = len(lines)
    while line_idx < line_count:
        line = lines[line_idx]
        line_len = len(line)

        # isolate first word: type of detection.
        ds = skip_white(line, 0)            # detection start

        # ignore empty and comment lines
        if ds == line_len or line[ds] == '#':
            line_idx = line_idx + 1
            continue

        de = skip_to_white(line, ds)        # detection end
        item = line[ds:de]
        # "as" cannot be used as a name, it is a reserved word since
        # Python 2.6.
        arg_start = skip_white(line, de)    # argument start

        # isolate first argument, which may be in quotes
        if arg_start < line_len:
            if line[arg_start] == '"' or line[arg_start] == "'":
                quote = line[arg_start]
                arg_start = arg_start + 1
                arg_end = line.find(quote, arg_start)
                if arg_end < 0:
                    raise DetectError(_('Missing quote in "%s"') % line)
                n = arg_end + 1             # continue after the quote
            else:
                arg_end = arg_start
                while arg_end < line_len and line[arg_end] not in ' \t':
                    arg_end = arg_end + 1
                n = arg_end
            arg1 = line[arg_start:arg_end]
            n = skip_white(line, n)
        else:
            arg1 = ''
            n = line_len

        # Isolate further arguments (no quotes!).
        # A superfluous argument is silently ignored (could be a comment).
        args = line[n:].split()
        if len(args) >= 1:
            arg2 = args[0]
        else:
            arg2 = ''
        if len(args) >= 2:
            arg3 = args[1]
        else:
            arg3 = ''

        if item == "suffix":
            if not arg2:
                raise DetectError(_('Missing argument in "%s"') % line)
            _add_suffix(arg1, arg2)

        elif item == "regexp":
            if not arg2:
                raise DetectError(_('Missing argument in "%s"') % line)
            _add_regexp(arg1, arg2, arg3 == "append")

        elif item == "script":
            if not arg2:
                raise DetectError(_('Missing argument in "%s"') % line)
            _add_script(arg1, arg2, arg3 == "append")

        elif item == "python":
            append = 0
            after = 0
            for arg in [arg1, arg2]:
                if arg:
                    if arg == "append":
                        append = 1
                    elif arg == "after":
                        after = 1
                    else:
                        raise DetectError(_('Illegal argument in "%s"')
                                                                      % line)

            # Collect the following lines that are indented more than the
            # "python" line: they are the Python code.
            start_indent = get_indent(line)
            line_idx = line_idx + 1
            start_line_idx = line_idx
            cmd_lines = []
            # Must compare with the number of lines here, not with the
            # length of the "python" line.
            while line_idx < line_count:
                line = lines[line_idx]
                if get_indent(line) <= start_indent:
                    line_idx = line_idx - 1     # this line has next item
                    break
                cmd_lines.append(line + '\n')
                line_idx = line_idx + 1
            cmds = ''.join(cmd_lines)
            if not cmds:
                raise DetectError(_('Python commands missing'))
            _add_python(cmds, _("filetype detection line %d") % start_line_idx,
                                                                after, append)

        else:
            # Raise an exception instance; raising a (class, message) tuple
            # drops the message.
            raise DetectError(
                   _("Illegal item %s in argument to ft_add_rules()") % item)

        line_idx = line_idx + 1

class _Ft_re:
    """Pair of a regular expression and the file type it stands for.
    Used for the items of _regexp_list and _script_list."""
    def __init__(self, re, type):
        # "re" is the pattern as given, "type" the resulting file type.
        self.re, self.type = re, type

class _Ft_py:
    """Python code for detecting a file type plus the text used to report
    an error in that code.
    Used for the items of _py_list_before and _py_list_after."""
    def __init__(self, code, error_msg):
        # code:      the Python code
        # error_msg: message used for errors
        self.code, self.error_msg = code, error_msg

def _add_suffix(suf, type):
    """Add detection of "type" by file name extension "suf".
       "suf" is the extension without the leading dot.
       When "type" is "ignore" it means the suffix is removed and further
       detection done on the rest.
       When "type" is "remove" an existing detection for "suf" is removed;
       it is not an error when there is none."""
    if type == 'remove':
        # "in" instead of has_key(), which no longer exists in Python 3.
        if suf in _suffix_dict:
            del _suffix_dict[suf]
    else:
        _suffix_dict[suf] = type

def _add_regexp(re, type, append):
    """Add detection of "type" by matching the file name with Python regular
       expression "re".
       When append is non-zero, add to the end of the regexp rules, otherwise
       insert at the start, so that the rule is checked first.
       When "type" is "remove" all existing detections for "re" are
       removed."""
    if type == 'remove':
        # Rebuild the list in place.  Removing items while looping over the
        # same list skips the item after each removed one.  The slice
        # assignment keeps the list object, no "global" is needed.
        _regexp_list[:] = [r for r in _regexp_list if r.re != re]
    else:
        f = _Ft_re(re, type)
        if append:
            _regexp_list.append(f)
        else:
            _regexp_list.insert(0, f)

def _add_script(re, type, append):
    """Add detection of "type" by matching the script name in the first line of
       the file with Python regular expression "re".
       When append is non-zero, add to the end of the script rules, otherwise
       insert at the start, so that the rule is checked first.
       When "type" is "remove" all existing detections for "re" are
       removed."""
    if type == 'remove':
        # Rebuild the list in place.  Removing items while looping over the
        # same list skips the item after each removed one.  The slice
        # assignment keeps the list object, no "global" is needed.
        _script_list[:] = [r for r in _script_list if r.re != re]
    else:
        f = _Ft_re(re, type)
        if append:
            _script_list.append(f)
        else:
            _script_list.insert(0, f)

def _add_python(code, error_msg, after, append):
    """Add a file type detection rule that consists of Python code "code".
       Each line in "code" must end in a '\n'.
       "error_msg" is used in the error raised when executing the code
       fails.
       When "after" is non-zero the rule goes into the list that is used
       after the suffix, regexp and script rules, otherwise into the list
       that is used before them.
       When "append" is non-zero the rule is added to the end of that list,
       otherwise it is put at the start."""
    rule = _Ft_py(code, error_msg)

    # Select the list this rule belongs in.
    rules = _py_list_before
    if after:
        rules = _py_list_after

    if not append:
        rules.insert(0, rule)
    else:
        rules.append(rule)

def _exec_py(fname, item):
    """Execute the code defined with _add_python().
       "item" is a _Ft_py object.  Its code is run with the variable "fname"
       set to the file name and reports the detected file type by assigning
       it to the variable "type".
       Returns the value of "type" or None when the code didn't set it.
       Raises DetectError, with the error message of "item", when executing
       the code fails.

       NOTE: the code comes from detection rules and is executed without any
       restriction; only load rules from sources you trust."""
    import sys

    # Make a completely fresh globals dictionary.
    new_globals = {"fname" : fname}

    # Prepend "if 1:" to get the indenting right.
    # Slicing avoids an IndexError when the code is empty.
    if item.code[:1] in (' ', '\t'):
        code = "if 1:\n" + item.code
    else:
        code = item.code

    try:
        # The tuple form of exec works as a statement and as a function.
        exec(code, new_globals, new_globals)
    except Exception:
        # StandardError no longer exists in Python 3, catch Exception.
        # sys.exc_info() is used because "except X, e" is not valid syntax
        # in every Python version.
        raise DetectError(_(item.error_msg) + ": " + str(sys.exc_info()[1]))

    return new_globals.get("type")

def ft_detect(fname):
    """Detect the file type for file "fname".
       The checks are done in this order, the first one that results in a
       type wins:
       1. Python rules that are to be used first
       2. file name extension
       3. regexp rules, matched with the file name
       4. script rules, matched with the first line of the file when it
          starts with "#!"
       5. Python rules that are to be used after the others
       Returns the type as a string or None.
       A DetectError raised for failing Python rule code is passed on."""
    # Initialize (will skip when done already)
    __init__()

    # On non-Posix systems we ignore case differences by making the name lower
    # case.
    if os.name != 'posix':
        fname = fname.lower()

    # Do the python code checks.
    for p in _py_list_before:
        ftype = _exec_py(fname, p)
        if ftype:
            return ftype

    # Try the extension, this is fastest.
    # When "fname" has several extensions, try with all of them first, then
    # try by removing the first ones: "f.html.c": "html.c" then ".c".
    # The search starts at index one to skip a dot that is the first
    # character: a hidden file like ".foo.c" still has the extension "c".
    bn = os.path.basename(fname)
    i = bn.find(".", 1)
    while i > 0 and i + 1 < len(bn):
        # Found a dot that's not the first or last character.
        suffix = bn[i + 1:]
        if suffix in _suffix_dict:
            ft = _suffix_dict[suffix]
            if ft != "ignore":
                return ft
            # remove an ignored extension and start over with what is left
            fname = fname[:-len(bn[i:])]
            bn = bn[:i]
            i = 0
        i = bn.find(".", i + 1)

    # match all defined REs with the file name.
    # TODO: handle "/" in RE and fname.
    for r in _regexp_list:
        if re.match(r.re, fname):
            return r.type

    # match all defined REs with the script name in the first line of the
    # file.
    try:
        f = open(fname)
        try:
            line = f.readline()
        finally:
            # Also close the file when reading failed.
            f.close()
    except (IOError, OSError, ValueError):
        # Errors for files that can't be read are ignored.  ValueError
        # includes a failure to decode the contents of a binary file.
        pass
    else:
        if len(line) > 2 and line[:2] == "#!":
            # TODO: remove "env VAR=val" and script arguments from line
            for r in _script_list:
                if re.match(r.re, line[2:]):
                    return r.type

    # Do the python code checks.
    for p in _py_list_after:
        ftype = _exec_py(fname, p)
        if ftype:
            return ftype

    return None

# When executed as a program, detect the type of the specified file.
# Usage: {program} [-I ruledir] [-f rulefile] filename
# The detected type is printed, "None" when it could not be detected.
if __name__ == '__main__':
    import sys

    # Internationalisation inits: setlocale and gettext.
    i18n_init()

    items = []          # list of {"dir" : name} and {"file" : name}
    checkfile = None    # name of the file to detect the type of
    _run_as_program = 1

    # Check for any "-Idir", "-I dir", "-ffile" and "-f file" arguments.
    # The items are kept in the order they were given.
    # NOTE: append() must be used to add the dictionary itself; extend()
    # would add its keys.
    next_is_dir = 0
    next_is_file = 0
    for arg in sys.argv[1:]:
        if next_is_dir:
            items.append({"dir" : arg})
            next_is_dir = 0
        elif next_is_file:
            items.append({"file" : arg})
            next_is_file = 0
        elif arg[:2] == "-I":
            if len(arg) > 2:
                items.append({"dir" : arg[2:]})
            else:
                next_is_dir = 1
        elif arg[:2] == "-f":
            if len(arg) > 2:
                items.append({"file" : arg[2:]})
            else:
                next_is_file = 1
        else:
            if checkfile:
                print(_("Can only check one file"))
                sys.exit(1)
            checkfile = arg

    if next_is_dir:
        print(_("-I argument must be followed by a directory name"))
        sys.exit(1)
    if next_is_file:
        print(_("-f argument must be followed by a file name"))
        sys.exit(1)

    if not checkfile:
        print(_("Usage: %s [-I ruledir] [-f rulefile] filename") % sys.argv[0])
        sys.exit(1)

    # load the built-in default rules
    __init__()

    # Check specified directories for *.afd files and read specified files.
    for item in items:
        if "dir" in item:
            ft_check_dir(item["dir"])
        else:
            try:
                ft_read_file(item["file"])
            except DetectError:
                # sys.exc_info() is used because "except X, e" is not valid
                # syntax in every Python version.
                print(str(sys.exc_info()[1]))

    # Detect the file given as a non-option argument; sys.argv[1] may be an
    # option like "-I".
    print(ft_detect(checkfile))

# vim: set sw=4 sts=4 tw=79 fo+=l: