#!/usr/local/bin/perl5
# bmonitor v2.21 for LSF
# Christian Rossi ([email protected])
# Centre Charles Hermite/LORIA (http://cch.loria.fr/LSF/bmonitor) - Nancy - France
# License for bmonitor : GNU General Public License
# (http://www.gnu.org/copyleft/gpl.html)
# v2.0  2000/03/20
# v2.01 2000/04/25
# v2.02 2000/04/27
# v2.1  2000/06/05
# v2.11 2000/07/07 new f_nb_proc
# v2.12 2000/09/05 modification of f_nb_proc
# v2.13 2000/09/18 modif of CPUTIM display
# v2.14 2000/09/21 modification of f_nb_proc
# v2.2  2000/10/03 add wait time for pending job on RUN/WA (fonction f_wait_time)
# v2.21 2000/10/12 hog value = 0 for pending job

my $VERSION = 2.21;

# Flush STDOUT after every print so that buffered output keeps its order
# relative to the unbuffered syswrite() countdown used below.
$| = 1;

system("clear");

# "true" during the first pass only: the first pass prints every job line
# as soon as it is computed, the following passes build the whole screen
# in $p_lignes and display it in one go after a clear.
$first = "true";

while (1)
 {

   # counter for the not pending jobs
   $job_count = 0;

   # default : bjobs -u all, otherwise the command line is handed to bjobs
   if (! @ARGV)
     {
       open(BJOBS,"bjobs -u all  2>&1|") or die "bmonitor: cannot run bjobs: $!\n";
     } else
       {
         open(BJOBS,"bjobs @ARGV 2>&1|") or die "bmonitor: cannot run bjobs: $!\n";
       }

   # if no job: show the message produced by bjobs and exit
   $first_ligne=<BJOBS>;
   if ($first_ligne =~ /No.*job found/)
     {
       print $first_ligne;
       exit;
     }

   # other possible display
   #print "JOBID USER     STAT  QUEUE     PROC MEM  SWAP CPUTIME RUNTIME RUNLIM  EFF    HOG  EXECHOST  SUBMITTIME\n";

   $p_first_ligne ="CPU  MEM  SWAP CPUTIM  JOB   USER    STAT    QUEUE    RUN/WA RUNLIM   EFF    HOG  EXECHOST  SUBMIT_TIME\n";
   $p_second_ligne="-------------------------------------------------------------------------------------------------------\n";

   # first time print now, after print in buffer
   if ($first ne "true")
     {
       $p_lignes = "$p_first_ligne" . "$p_second_ligne";
     } else
       {
         print $p_first_ligne;
         print $p_second_ligne;
       }

   @lignes_jobs=<BJOBS>;
   close(BJOBS);

   # lines of bjobs
   foreach $ligne (@lignes_jobs)
     {

       # work with lines that begin with a job number
       # NOTE(review): the pattern needs four leading digits, so job ids
       # below 1000 are not matched - confirm this is intended.
       if ($ligne =~ /^[0-9][0-9][0-9][0-9]/)
         {

           ($jobid,$user,$stat,$queue,$from_host,$exec_host)=split(/ +/,$ligne);

           # decrease the displayed time to wait for each not pending job.
           # This is done after the split so that $stat is the status of
           # the current job and not the one of the previous line.
           if (($first ne "true") && ($stat ne "PEND"))
             {
               if ($old_job_count < 0)
                 {
                   $old_job_count = 0;
                 }
               $p_old_job_count = sprintf("\rbmonitor $VERSION - %s - Update in %ss ",$date,$old_job_count);
               syswrite(STDOUT,"$p_old_job_count",55);
               $old_job_count = $old_job_count - 1;
             }

           # detailed output for this job; the f_* functions read these
           # two arrays.  An open failure leaves the array empty.
           @bjobsl_out = ();
           if (open(BJOBSL,"bjobs -l $jobid |"))
             {
               @bjobsl_out = <BJOBSL>;
               close(BJOBSL);
             }

           # bhist is needed for every job: run time and hog factor for
           # the not pending jobs, wait time for the pending ones
           @bhist_out = ();
           if (open(BHIST,"bhist $jobid|"))
             {
               @bhist_out = <BHIST>;
               close(BHIST);
             }

           if ($stat ne "PEND")
             {

               $job_count = $job_count + 1;

               # job name
               $job_name = $ligne;
               $job_name =~ s/^.{56}//;
               $job_name =~ s/.{13}$//;
               chop($job_name);

               # submit time
               $submit_time = $ligne;
               $submit_time =~ s/^.{67}//;
               chop($submit_time);

               # number of proc
               $nb_proc = f_nb_proc($jobid);

               # memory and swap
               @mem_swap = f_mem_swap($jobid);
               $mem = $mem_swap[0];
               $swap = $mem_swap[1];

               # cpu time
               @total_hour_min = f_cpu_time($jobid);
               $cpu_time_in_sec = $total_hour_min[0];
               $cpu_time_hour = $total_hour_min[1];
               $cpu_time_min = $total_hour_min[2];

               # run time
               @total_hour_min = f_run_time($jobid);
               $run_time_in_sec = $total_hour_min[0];
               $run_time_hour = $total_hour_min[1];
               $run_time_min = $total_hour_min[2];

               # run limit
               @total_hour_min = f_run_limit($jobid);
               $run_limit_in_sec = $total_hour_min[0];
               $run_limit_hour = $total_hour_min[1];
               $run_limit_min = $total_hour_min[2];

               # eff (100 * cpu_time / (nb_proc * run_time))
               # reads $nb_proc, $cpu_time_in_sec and $run_time_in_sec set above
               $efficasity = f_efficasity($jobid);

               # hog factor (100 * run_time / total_time)
               $hog_factor = f_hog_factor($jobid);

             }
           else                # $stat eq "PEND"
             {

               # job name
               $job_name = $ligne;
               $job_name =~ s/^.{66}//;
               $job_name =~ s/.{13}$//;
               chop($job_name);

               # submit time
               $submit_time = $ligne;
               $submit_time =~ s/^.{67}//;
               $submit_time =~ s/.{13}$//;
               chop($submit_time);

               # number of proc
               $nb_proc = f_nb_proc($jobid);

               # memory and swap
               $mem  = 0;
               $swap = 0;

               # cpu time
               $cpu_time_in_sec = 0;
               $cpu_time_hour   = 0;
               $cpu_time_min    = 0;

               # run time
               $run_time_in_sec = 0;
               $run_time_hour   = 0;
               $run_time_min    = 0;

               # wait time (use runtime place for display)
               @wait_total_hour_min  = f_wait_time($jobid);
               $wait_time_hour       = $wait_total_hour_min[1];
               $wait_time_min        = $wait_total_hour_min[2];

               # run limit
               @total_hour_min   = f_run_limit($jobid);
               $run_limit_in_sec = $total_hour_min[0];
               $run_limit_hour   = $total_hour_min[1];
               $run_limit_min    = $total_hour_min[2];

               # hog factor (100 * run_time / total_time with run_time = 0)
               $hog_factor = 0;

             }                 # end (if ne PEND / else)

           # format the values common to every status with sprintf
           $p_jobid      = sprintf("%5s",$jobid);
           $p_user       = sprintf("%-8s",$user);
           $p_stat       = sprintf("%-5s",$stat);
           $p_queue      = sprintf("%-10s",$queue);
           $p_nb_proc    = sprintf("%2s",$nb_proc);
           $p_mem        = sprintf("%5.0f",$mem);
           $p_swap       = sprintf("%5.0f",$swap);
           $p_run_limit  = sprintf("%3s:%02d",$run_limit_hour,$run_limit_min);
           $p_hog_factor = sprintf("%5.1f%%",$hog_factor);

           if ($stat ne "PEND")
             {
               if ($cpu_time_hour < 1000)
                 {
                   $p_cpu_time = sprintf("%3s:%02d",$cpu_time_hour,$cpu_time_min);
                 } else
                   {
                     # four digits of hours: keep the column width by
                     # showing only the tens of minutes
                     $cpu_time_min = $cpu_time_min / 10;
                     $p_cpu_time   = sprintf("%4s:%01d",$cpu_time_hour,$cpu_time_min);
                   }
               $p_run_time   = sprintf("%3s:%02d",$run_time_hour,$run_time_min);
               $p_exec_host  = sprintf("%-8s",$exec_host);
               $p_efficasity = sprintf("%5.1f%%",$efficasity);
             } else
               {
                 # pending job: the RUN/WA column shows the wait time and
                 # the EXECHOST and EFF columns are left blank
                 $p_cpu_time   = sprintf("%3s:%02d",$cpu_time_hour,$cpu_time_min);
                 $p_run_time   = sprintf("%3s:%02d",$wait_time_hour,$wait_time_min);
                 $p_exec_host  = sprintf("%8s","");
                 $p_efficasity = sprintf("%6s","");
               }

           if ($first ne "true")
             {
               $lignes = sprintf("%s %s %s %s %s %s %s %s %s %s %s %s %s %s\n",$p_nb_proc,$p_mem,$p_swap,$p_cpu_time,$p_jobid,$p_user,$p_stat,$p_queue,$p_run_time,$p_run_limit,$p_efficasity,$p_hog_factor,$p_exec_host,$submit_time);
               $p_lignes = "$p_lignes" . "$lignes";
             } else
               {
                 print "$p_nb_proc $p_mem $p_swap $p_cpu_time $p_jobid $p_user $p_stat $p_queue $p_run_time $p_run_limit $p_efficasity $p_hog_factor $p_exec_host $submit_time\n";
               }

         }                     # if begin with job number [0-9]
     }                         # foreach ligne of bjob

   if ($first eq "false")
     {
       # display the job in one time
       system("clear");
       print "$p_lignes";
     }

   # next display
   $first = "false";

   # print bhosts and lsload
   print "\n";
   system("bhosts");
   print "\n";
   system("lsload");
   print "\n";

   # date: 2000/03/21 10:44:46
   $date=`date '+%Y/%m/%d %H:%M:%S'`;
   chop($date);

   # user wait for about 50 sec
   $old_job_count = 50;
   if ($job_count <= 50)
     {
       # wait time = sleep time + time to run bjob -l and bhist
       $sleep_time = 50 - $job_count ;
     } else
       {
         # more than 50 running jobs: do not reuse the sleep time of a
         # previous pass
         $sleep_time = 0;
       }

   # print the date and the delay before update
   for ($t=0; $t <= $sleep_time ; $t++)
     {
       $p_old_job_count = sprintf("\rbmonitor $VERSION - %s - Update in %ss ",$date,$old_job_count);
       syswrite(STDOUT,"$p_old_job_count",55);
       $old_job_count = $old_job_count - 1;
       sleep(1);
     }

 }                             # while 1

# other possible display
#print "$p_jobid $p_user $p_stat $p_queue $p_nb_proc $p_mem $p_swap $p_cpu_time $p_run_time $p_run_limit $p_efficasity $p_hog_factor $p_exec_host $submit_time\n"

########################################################################################
########################################################################################
## fonctions for bmonitor                                                             ##
########################################################################################
########################################################################################

######################################
# f_efficasity
# efficiency in percent:
#   100 * cpu time / (nb_proc * run_time)
# reads the globals $nb_proc, $run_time_in_sec and $cpu_time_in_sec
# set by the main loop; the argument is not used.
# the result is returned as a string and is limited to 999.9, which is
# also the value returned when nb_proc * run_time is not positive.
######################################

sub f_efficasity
 {
   my $l_denominator = $nb_proc * $run_time_in_sec;

   # nothing to divide by: report the display ceiling
   return "999.9" unless $l_denominator > 0;

   my $l_percent = 100 * $cpu_time_in_sec / $l_denominator;
   $l_percent = 999.9 if $l_percent > 999.9;

   return "$l_percent";
 }

######################################
# f_mem_swap
# memory and swap of the current job
# use bjobs -l $jobid (read from @bjobsl_out; the argument is not used)
#
# the first line containing "MEM: " is split on blanks: fields 2 and 3
# are taken as the memory value and unit, fields 5 and 6 as the swap
# value and unit.  A value whose unit starts with "Kbytes" is divided
# by 1024; any other unit is returned unchanged.
# returns ($mem,$swap); "0" is returned for a value that was not found.
######################################

sub f_mem_swap
 {
   my $l_mem = "";
   my $l_swap = "";
   # the units are local to the call so that a job without a MEM line
   # never inherits the unit of the previous job
   my $l_mem_unit = "";
   my $l_swap_unit = "";

   foreach my $l_text (@bjobsl_out)
     {
       if ($l_text =~ /MEM: /)
         {
           my @l_word = split(/ +/,$l_text);
           ($l_mem,$l_mem_unit,$l_swap,$l_swap_unit) = @l_word[2,3,5,6];
           last;
         }
     }

   # fields missing from a short line
   $l_mem       = "" unless defined $l_mem;
   $l_mem_unit  = "" unless defined $l_mem_unit;
   $l_swap      = "" unless defined $l_swap;
   $l_swap_unit = "" unless defined $l_swap_unit;

   # the unit is matched by prefix: it may be followed by ";" or, when
   # it is the last field of the line, by the end of line
   if (($l_mem ne "") && ($l_mem_unit =~ /^Kbytes/))
     {
       $l_mem = $l_mem / 1024;
     }

   if (($l_swap ne "") && ($l_swap_unit =~ /^Kbytes/))
     {
       $l_swap = $l_swap / 1024;
     }

   $l_swap = "0" if $l_swap eq "";
   $l_mem  = "0" if $l_mem eq "";

   return ($l_mem,$l_swap);
 }

#####################################################
# f_cpu_time
# cpu time of the current job
# use bjobs -l $jobid (read from @bjobsl_out; the argument is not used)
#
# the value is field 6 (blank separated) of the first line containing
# the word "seconds".
# returns (total cpu time in seconds, hours, minutes); the total is ""
# when no such line exists.
#####################################################

sub f_cpu_time
 {
   my $l_seconds = "";

   foreach my $l_text (@bjobsl_out)
     {
       next unless $l_text =~ /seconds/;
       $l_seconds = (split(/ +/,$l_text))[6];
       last;
     }

   # conversion in hh:mm (the remaining seconds are dropped)
   my $l_minutes = 0;
   my $l_hours = 0;

   $l_minutes = int($l_seconds / 60) if $l_seconds >= 60;

   if ($l_minutes >= 60)
     {
       $l_hours = int($l_minutes / 60);
       $l_minutes -= $l_hours * 60;
     }

   return ($l_seconds,$l_hours,$l_minutes);
 }

#######################################################
# f_run_time
# run time of the current job
# use bhist $jobid (read from @bhist_out; the argument is not used)
#
# the first line starting with at least three digits is used: its first
# 25 characters are removed (the job name may contain spaces) and field
# 3 of what is left is the run time.
# returns (total run time in seconds, hours, minutes); the total is ""
# when no such line exists.
#######################################################

sub f_run_time
 {
   my $l_seconds = "";

   foreach my $l_row (@bhist_out)
     {
       next unless $l_row =~ /^[0-9]{3,}/;

       # cut a copy so that @bhist_out is left untouched
       (my $l_states = $l_row) =~ s/.{25}//;
       $l_seconds = (split(/ +/,$l_states))[3];
       last;
     }

   # conversion in hh:mm (the remaining seconds are dropped)
   my $l_minutes = 0;
   my $l_hours = 0;

   $l_minutes = int($l_seconds / 60) if $l_seconds >= 60;

   if ($l_minutes >= 60)
     {
       $l_hours = int($l_minutes / 60);
       $l_minutes -= $l_hours * 60;
     }

   return ($l_seconds,$l_hours,$l_minutes);
 }

#######################################################
# f_wait_time
# wait time of the current job (before start)
# use bhist $jobid (read from @bhist_out; the argument is not used)
#
# same parsing as f_run_time, but field 1 of the shortened line is
# taken as the wait time.
# returns (total wait time in seconds, hours, minutes); the total is 0
# when no line starting with at least three digits exists.
#######################################################

sub f_wait_time
 {
   my $l_seconds = 0;

   foreach my $l_row (@bhist_out)
     {
       if ($l_row =~ /^[0-9]{3,}/)
         {
           # the job name may contain spaces: drop the first 25
           # characters of a copy before splitting
           my $l_states = $l_row;
           $l_states =~ s/.{25}//;
           my @l_fields = split(/ +/,$l_states);
           $l_seconds = $l_fields[1];
           last;
         }
     }

   # conversion in hh:mm (the remaining seconds are dropped)
   my ($l_hours,$l_minutes) = (0,0);

   if ($l_seconds >= 60)
     {
       $l_minutes = int($l_seconds / 60);
     }

   if ($l_minutes >= 60)
     {
       $l_hours = int($l_minutes / 60);
       $l_minutes = $l_minutes - ($l_hours * 60);
     }

   return ($l_seconds,$l_hours,$l_minutes);
 }


###########################################################
# f_run_limit
# run limit of the current job
# use bjobs -l $jobid (read from @bjobsl_out; the argument is not used)
#
# the first line matching "<digit> min of" is split on blanks; field 1
# is the limit, unless field 5 contains a digit (a cpu limit and a run
# limit on the same line), in which case field 5 is used.
# users must ask for a run limit (else it's a cpu limit)
# returns (limit as read, i.e. in minutes, hours, minutes); the limit
# is "" when no such line exists.
###########################################################

sub f_run_limit
 {
   my $l_limit = "";

   foreach my $l_text (@bjobsl_out)
     {
       next unless $l_text =~ /[0-9] min of/;

       my @l_fields = split(/ +/,$l_text);
       # if cpu limit and run limit, the run limit is the second one
       $l_limit = ($l_fields[5] =~ /[0-9]/) ? $l_fields[5] : $l_fields[1];
       last;
     }

   # the limit is a number of minutes: split it in hours and minutes
   my $l_hours = 0;
   my $l_minutes;

   if ($l_limit >= 60)
     {
       $l_hours = int($l_limit / 60);
       $l_minutes = $l_limit - ($l_hours * 60);
     }
   else
     {
       $l_minutes = int($l_limit);
     }

   return ($l_limit,$l_hours,$l_minutes);
 }

########################################################
# f_nb_proc
# number of processors of the job
# search Processors in paragraph Submitted from host
# use bjobs -l $jobid (read from @bjobsl_out; the argument is not used)
#
# the paragraph starts at the line containing "Submitted from host" and
# ends at the first empty line; its lines are joined (the leading
# blanks of the continuation lines are removed) before the search.
# The number is the word preceding "Processors", or what is glued in
# front of it in the same word.
# returns 1 when nothing (or 0) is found.
# @bjobsl_out is not modified.
##########################################################

sub f_nb_proc
 {

   my $l_nb_cpu = "";
   my $l_paragraph = "";
   my $l_in_paragraph = 0;

   foreach my $l_source (@bjobsl_out)
     {
       # the foreach variable is an alias of the array element: work on
       # a copy so that the other f_* functions still see the original
       # bjobs -l output
       my $l_text = $l_source;

       # no empty line and in good paragraph
       if ($l_in_paragraph && ($l_text =~ /[\w\d]/))
         {
           # delete space at the beginning
           $l_text =~ s/^\s+//;
           $l_text =~ s/,/, /g;
           $l_text =~ s/Processors/Processors /;
           # delete the end of line
           chomp($l_text);
           # add each line of the paragraph
           $l_paragraph .= $l_text;
         }
       # empty line: end of the good paragraph
       elsif ($l_in_paragraph && ($l_text =~ /^$/))
         {
           $l_in_paragraph = 0;
         }
       # enter in good paragraph
       elsif (!$l_in_paragraph && ($l_text =~ /Submitted from host/))
         {
           chomp($l_text);
           $l_paragraph = $l_text;
           $l_paragraph =~ s/,/, /g;
           $l_paragraph =~ s/Processors/Processors /;
           $l_in_paragraph = 1;
         }

     }                         # foreach line

   # reset for every call, a previous job must not leak into this one
   my $l_prev_word = "";
   foreach my $l_word (split(/ +/,$l_paragraph))
     {
       if ($l_word eq "Processors")
         {
           $l_nb_cpu = $l_prev_word;
           last;
         }
       elsif ($l_word =~ /Processors/)
         {
           $l_nb_cpu = $l_word;
           $l_nb_cpu =~ s/Processors//;
           last;
         }
       $l_prev_word = $l_word;
     }                         # foreach word

   if ( ! $l_nb_cpu )
     {
       $l_nb_cpu = 1;
     }
   return $l_nb_cpu;

 }

########################################################
# f_hog_factor
# hog factor in percent = 100 * run time / turnaround time
# turnaround time = PEND + PSUSP + RUN + USUSP + SSUSP
# use bhist $jobid (read from @bhist_out; the argument is not used)
#
# the first line starting with at least three digits is used: after
# its first 25 characters are removed, field 3 is the run time and
# field 7 the total time (an empty run time counts as 0, an empty
# total time as 1).
# the result is limited to 999.9, which is also the value returned
# when the total time is not positive.
########################################################

sub f_hog_factor
 {
   my $l_total = 0;
   my $l_run = 0;

   foreach my $l_row (@bhist_out)
     {
       next unless $l_row =~ /^[0-9]{3,}/;

       # the job name may contain spaces: cut it off a copy so that
       # @bhist_out is left untouched
       (my $l_states = $l_row) =~ s/.{25}//;
       my @l_fields = split(/ +/,$l_states);

       $l_run = $l_fields[3];
       $l_run = 0 if $l_run eq "";

       $l_total = $l_fields[7];
       $l_total = 1 if $l_total eq "";

       last;
     }

   # no total time: report the display ceiling
   return 999.9 unless $l_total > 0;

   my $l_percent = 100 * $l_run / $l_total;
   return ($l_percent > 999.9) ? 999.9 : $l_percent;

 }

#
#
#########################################################################

__END__

=head1 NAME

bmonitor - display information about LSF jobs and hosts

=head1 DESCRIPTION

bmonitor is a Perl script to monitor LSF jobs.
Every minute the script shows useful information about each job.
This script uses LSF (Load Sharing Facility).

=head1 README

bmonitor is a Perl script to monitor LSF jobs.
Every minute the script shows the following information for each job:

   CPU : number of processors requested by the user
   MEM : memory used by the job (MB)
   SWAP : swap used by the job (MB)
   CPUTIM : cpu time of the job (hh:mm)
   JOB : identification number of the job
   USER : user login
   STAT : status of the job (PEND, PSUSP, USUSP, SSUSP, RUN)
   QUEUE : name of the queue
   RUN/WA : run or wait time: time spent by the job in RUN status for a
            running job, or time spent in PEND status for a pending job (hh:mm)
   RUNLIM : maximum run time requested by the user (hh:mm)
   EFF : cpu time / (run time * number of processors)
   HOG : run time / total time the job has spent in LSF
   EXECHOST : execution host
   SUBMIT_TIME : submission date

The options of bjobs can be used with bmonitor.

This Perl script is available from http://cch.loria.fr/LSF/bmonitor/
and is distributed under the GNU General Public License.

For more information, send an e-mail to [email protected].

Centre Charles Hermite/LORIA  - Nancy - France
http://cch.loria.fr/
http://www.loria.fr/~rossi

=head1 AUTHOR

Christian Rossi <[email protected]>

=pod OSNAMES

Unix

=pod SCRIPT CATEGORIES

UNIX/System_administration

=cut