#!/usr/bin/perl
#
# Convert sendmail, postfix, smail, or qmail logs to common log format so
# they can be processed by standard web log processing software.
#
# Here's a sample log entry, in common log format:
#
# [email protected] - - [31/May/1996:13:55:28 -0400] "GET /fred/" 200 541
#
# Meaning that [email protected] sent mail to fred, on the given date, and the
# message was 541 bytes long.
#
# Only mail that was successfully sent is logged.
#
# Maillog2Commonlog v. 3.2 is copyright 1995, 1996 by Joey Hess.
# May be distributed under the terms of the GPL.
# (http://www.gnu.org/copyleft/gpl.html)
#
# Usage:
#       maillog2commonlog [sendmail|postfix|smail|newsmail|qmail] < logfile
#
# Note: if your smail is < version 3.2, then use smail. If it is 3.2 or
# greater, the logfile format changed, and you must use newsmail instead.
#
# Note: it only works for qmail if qmail is set up to log messages via
# syslog. Otherwise, it isn't going to find timestamps.

# The first command-line argument names the log format. It is lower-cased so
# that e.g. "Sendmail" is accepted; a missing argument is treated as an empty
# string and falls through to the usage message below.
$logtype=shift;
$logtype=defined $logtype ? lc $logtype : '';

# Anything other than one of the supported format names prints the usage
# text and stops.
unless (grep { $logtype eq $_ } qw(sendmail postfix smail newsmail qmail)) {
    print <<eof;
Usage:
       maillog2commonlog [sendmail|postfix|smail|newsmail|qmail] < logfile
eof
    exit;
}

# Text used as the timezone offset in every logged timestamp. Keep the
# leading space: it separates the offset from the time of day.
$tzoffset = ' -0400';

# Hosts for which the actual username of the person sending/receiving mail
# is logged. For any other host only the hostname is logged.
@pub_hosts = qw(
    localhost
    box
    box.kite.ml.org
    kite
    kite.ml.org
    kite.preferred.com
    kitenet.net
    box.kitenet.net
    kite.kitenet.net
);

# Print one common-log-format line for the buffered message $message_id and
# then drop that message from the buffer.
#
# Reads the globals %msg_buf (fields from, to, mon, day, time and size of the
# entry), $year and $tzoffset. The line goes to the currently selected output
# handle. Returns nothing useful.
#
# Because the entry is removed once it has been printed, a later delivery
# record for the same message id no longer finds a sender and is not logged.
sub Log { my $message_id=shift;
    my $msg=$msg_buf{$message_id} || {};
    print "$msg->{from} - - [$msg->{day}/$msg->{mon}/$year:$msg->{time}$tzoffset] \"GET /$msg->{to}/\" 200 $msg->{size}\n";
    # delete (not undef) so that the key itself goes away; otherwise the
    # buffer keeps one dead entry per logged message for the whole run.
    delete $msg_buf{$message_id};
    return;
}

# Reduce an email address to the token that is written to the log.
#
# Every '<', '|' and '>' character is removed first. If the result contains
# an '@', the part after it is looked up in the global %pub_hosts_hash:
#   - listed host:   return the part before the '@' (the username);
#   - any other host: return only the host part.
# An address without '@' is returned as is (minus the stripped characters),
# and an undefined argument is returned unchanged.
#
# Works on a private copy of the argument, so the caller's $_ (the current
# log line in the main loop) is left untouched.
sub FixEmail { my $addr=shift;
    return $addr unless defined $addr;
    $addr=~s/[<|>]//g;
    if ($addr=~m/\@(.*)$/) {
        my $host=$1;
        if ($pub_hosts_hash{$host}) { ($addr)=$addr=~m/^(.*)\@/ } else { $addr=$host }
    }
    return $addr;
}

# Index @pub_hosts as a hash (every listed host maps to 1) so that FixEmail
# can test membership with a single lookup.
@pub_hosts_hash{@pub_hosts} = (1) x @pub_hosts;

# The year is not taken from the log records; the current year is stamped on
# every entry. localtime reports the year as an offset from 1900, so adding
# 1900 gives the full four-digit year without running an external date
# command and parsing its locale-dependent output.
$year=(localtime)[5]+1900;

# Now on to actually processing the logs. Sendmail and smail use very
# different file formats: sendmail puts everything on one line, while smail
# uses a multi-line format that is easier to process, with \n\n separating
# each multi-line record. And newsmail is ugly ('nuff said..)

# For old-style smail, make every read return a whole multi-line record.
$/="\n\n" if $logtype eq 'smail';

# Both smail variants log the month as a number; build the table that maps
# month number (1-12) to its Mmm abbreviation. Index 0 is left unused.
@date_trans[1..12]=qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)
    if $logtype=~m/smail/;

# Main loop: read the log one record at a time and pair up the two halves of
# each message.
#
# There are 2 distinct record types: mail being received (gives the sender
# and the size) and mail being sent (gives the recipient and the timestamp).
# Whatever a record yields is stored in %msg_buf under its message id, and
# the message is handed to Log as soon as both halves have been seen, in
# whichever order they arrive.
#
# For qmail there are 3 record types: mail received, delivery started, and
# delivery completed. Only the first two are matched here, so a qmail
# message is logged when its delivery starts.
while (<>) {
    if (/: from=/ || /\] received\n/m || /\] Received / || /info msg .* from/) {
        # Received mail.
        my ($message_id,$from,$size);

        if (/: from=/) { # SENDMAIL and POSTFIX
            ($message_id,$from,$size)=m/\w+\s+\d+\s+\d+:\d+:\d+\s+\w+\s+(?:sendmail|sm-mta|postfix\/qmgr)\[\d+\]:\s+(.*?):\s+from=(.*?),\s+size=(.*?),/;
        }
        elsif (/\] received\n/m) { # SMAIL
            ($message_id,$from)=m/^\d+\/\d+\/\d+\s+\d+\:\d+\:\d+\:\s+\[(.*?)\]\s+received\n\|\s+from:\s+(.*?)\n/m;
            ($size)=m/\|\s+size:\s+(\d+)\s+bytes\n/m;
        }
        elsif (/\] Received /) { # NEWSMAIL
            ($message_id)=m/\[(.*?)\]/;
            ($from)=m/Received FROM:(.*?) /;
            ($size)=m/SIZE:(\d+)\s/;
        }
        elsif (/info msg .* from/) { # QMAIL
            ($message_id,$size,$from)=m/info msg (\d+): bytes (\d+) from <(.*)>/;
        }

        # The cheap test above recognised the record but the full pattern
        # did not match it. Without a message id there is nothing to file
        # the data under, and storing it under an empty key would pair
        # unrelated records with each other, so skip the record.
        next unless defined $message_id;

        $from="unknown" if !$from;
        $from=FixEmail($from);

        $msg_buf{$message_id}{from}=$from;
        $msg_buf{$message_id}{size}=$size;

        # The delivery record may already have been seen.
        Log($message_id) if $msg_buf{$message_id}{to};
    }
    elsif (/: to=.*stat(us)?=sent/i || /\] delivered\n/m ||
           /\] Delivered / || /starting delivery/) {
        # The record logs mail being sent ok.
        my ($mon,$day,$time,$message_id,$to);

        if (/: to=.*stat(us)?=sent/i) { # SENDMAIL and POSTFIX
            ($mon,$day,$time,$message_id,$to)=m/(\w+)\s+(\d+)\s+(\d+:\d+:\d+)\s+\w+\s+(?:sendmail|sm-mta|postfix\/(?:local|smtp))\[.*?\]:\s+(.*?):\s+to=(.*?),/;
        }
        elsif (/\] delivered\n/m) { # SMAIL
            ($mon,$day,$time,$message_id,$to)=m/(\d+)\/(\d+)\/\d+\s+(\d+:\d+:\d+):\s\[(.*?)\] delivered\n\|\s+to:\s+(.*?)\n/m;
            $mon=$date_trans[$mon]; # Translate to Mmm format.
        }
        elsif (/\] Delivered /) { # NEWSMAIL
            ($mon,$day,$time,$message_id)=m/(\d+)\/(\d+)\/\d+\s+(\d+:\d+:\d+):\s\[(.*?)\]/;
            ($to)=m/TO:(.*?)\s/;
            $mon=$date_trans[$mon]; # Translate to Mmm format.
        }
        elsif (/starting delivery/) { # QMAIL
            ($mon,$day,$time,$message_id,$to)=m/^(\w+)\s+(\d+)\s+(\d+:\d+:\d+)\s+.*\s+msg\s+(\d+)\s+to\s+.*?\s+(.*)$/;
        }

        # Same reasoning as for received mail: a record whose full pattern
        # did not match carries no usable message id.
        next unless defined $message_id;

        $to=FixEmail($to);
        # Pad a single-digit day of the month to two digits.
        $day="0$day" if length($day) == 1;

        $msg_buf{$message_id}{mon}=$mon;
        $msg_buf{$message_id}{day}=$day;
        $msg_buf{$message_id}{time}=$time;
        $msg_buf{$message_id}{to}=$to;

        # The received record may already have been seen.
        Log($message_id) if $msg_buf{$message_id}{from};
    }
}