#!/usr/local/bin/perl5
# bmonitor v2.21 for LSF
# Christian Rossi ([email protected])
# Centre Charles Hermite/LORIA (http://cch.loria.fr/LSF/bmonitor) - Nancy - France
# License for bmonitor : GNU General Public License
# (http://www.gnu.org/copyleft/gpl.html)
# v2.0 2000/03/20
# v2.01 2000/04/25
# v2.02 2000/04/27
# v2.1 2000/06/05
# v2.11 2000/07/07 new f_nb_proc
# v2.12 2000/09/05 modification of f_nb_proc
# v2.13 2000/09/18 modif of CPUTIM display
# v2.14 2000/09/21 modification of f_nb_proc
# v2.2 2000/10/03 add wait time for pending jobs in RUN/WA (function f_wait_time)
# v2.21 2000/10/12 hog value = 0 for pending job
# Main program of bmonitor.
# Loops forever: runs bjobs (with the command-line options, or "-u all" when
# none are given), runs "bjobs -l" and "bhist" for every listed job, prints one
# formatted line per job followed by the output of bhosts and lsload, then
# counts down for about 50 seconds before the next refresh.
# The first pass prints each line as soon as it is ready; later passes build
# the whole table in $p_lignes and display it in one go after clearing the
# screen.
# The f_* functions read the globals @bjobsl_out, @bhist_out, $nb_proc,
# $cpu_time_in_sec and $run_time_in_sec that are set here.
my $VERSION = 2.21;
# unbuffered STDOUT: keeps print() output in order with the syswrite() status
# line and with the output of the commands started through system()
$| = 1;
system("clear");
# first display
$first = "true";
while (1)
{
    # counter for the not pending jobs
    $job_count = 0;
    # default : bjobs -u all
    if (! @ARGV)
    {
        open(BJOBS,"bjobs -u all 2>&1|") or die "bmonitor: cannot run bjobs: $!\n";
    } else
    {
        open(BJOBS,"bjobs @ARGV 2>&1|") or die "bmonitor: cannot run bjobs: $!\n";
    }
    # if no job exit
    $first_ligne=<BJOBS>;
    if (defined($first_ligne) && $first_ligne =~ /No.*job found/)
    {
        # show the message printed by bjobs itself
        print $first_ligne;
        exit;
    }
    # other possible display
    #print "JOBID USER STAT QUEUE PROC MEM SWAP CPUTIME RUNTIME RUNLIM EFF HOG EXECHOST SUBMITTIME\n";
    $p_first_ligne ="CPU MEM SWAP CPUTIM JOB USER STAT QUEUE RUN/WA RUNLIM EFF HOG EXECHOST SUBMIT_TIME\n";
    $p_second_ligne="-------------------------------------------------------------------------------------------------------\n";
    # first time print now, after print in buffer
    if ($first ne "true")
    {
        $p_lignes = $p_first_ligne . $p_second_ligne;
    } else
    {
        print $p_first_ligne;
        print $p_second_ligne;
    }
    @lignes_jobs=<BJOBS>;
    close(BJOBS);
    # lines of bjobs
    foreach $ligne (@lignes_jobs)
    {
        # work only with lines that begin with a job number
        # NOTE(review): this pattern requires at least 4 leading digits, so
        # jobs with an id below 1000 are not displayed - confirm this is wanted.
        next unless $ligne =~ /^[0-9][0-9][0-9][0-9]/;
        # decrease time to wait for each not pending job and print it
        # ($stat still holds the status of the previously processed job here)
        if (($first ne "true") && ($stat ne 'PEND'))
        {
            if ($old_job_count < 0)
            {
                $old_job_count = 0;
            }
            $p_old_job_count = sprintf("\rbmonitor $VERSION - %s - Update in %ss ",$date,$old_job_count);
            syswrite(STDOUT,"$p_old_job_count",55);
            $old_job_count = $old_job_count - 1;
        }
        ($jobid,$user,$stat,$queue,$from_host,$exec_host)=split(/ +/,$ligne);
        # detailed output of the job, parsed by the f_* functions;
        # the arrays stay empty when a command cannot be started
        @bjobsl_out = ();
        if (open(BJOBSL,"bjobs -l $jobid |"))
        {
            @bjobsl_out = <BJOBSL>;
            close(BJOBSL);
        }
        @bhist_out = ();
        if (open(BHIST,"bhist $jobid|"))
        {
            @bhist_out = <BHIST>;
            close(BHIST);
        }
        if ($stat ne 'PEND')
        {
            $job_count = $job_count + 1;
            # job name
            $job_name = $ligne;
            $job_name =~ s/^.{56}//;
            $job_name =~ s/.{13}$//;
            chop($job_name);
            # submit time
            $submit_time = $ligne;
            $submit_time =~ s/^.{67}//;
            chop($submit_time);
            # number of proc
            $nb_proc = &f_nb_proc($jobid);
            # memory and swap
            @mem_swap = &f_mem_swap($jobid);
            $mem = $mem_swap[0];
            $swap = $mem_swap[1];
            # cpu time
            @total_hour_min = &f_cpu_time($jobid);
            $cpu_time_in_sec = $total_hour_min[0];
            $cpu_time_hour = $total_hour_min[1];
            $cpu_time_min = $total_hour_min[2];
            # run time
            @total_hour_min = &f_run_time($jobid);
            $run_time_in_sec = $total_hour_min[0];
            $run_time_hour = $total_hour_min[1];
            $run_time_min = $total_hour_min[2];
            # run limit
            @total_hour_min = &f_run_limit($jobid);
            $run_limit_in_sec = $total_hour_min[0];
            $run_limit_hour = $total_hour_min[1];
            $run_limit_min = $total_hour_min[2];
            # eff (100 * cpu_time / (nb_proc * run_time))
            $efficasity = &f_efficasity($jobid);
            # hog factor (100 * run_time / total_time)
            $hog_factor = &f_hog_factor($jobid);
        } else
        {
            # pending job
            # job name
            $job_name = $ligne;
            $job_name =~ s/^.{66}//;
            $job_name =~ s/.{13}$//;
            chop($job_name);
            # submit time
            $submit_time = $ligne;
            $submit_time =~ s/^.{67}//;
            $submit_time =~ s/.{13}$//;
            chop($submit_time);
            # number of proc
            $nb_proc = &f_nb_proc($jobid);
            # memory and swap
            $mem = 0;
            $swap = 0;
            # cpu time
            $cpu_time_in_sec = 0;
            $cpu_time_hour = 0;
            $cpu_time_min = 0;
            # run time
            $run_time_in_sec = 0;
            $run_time_hour = 0;
            $run_time_min = 0;
            # wait time (use runtime place for display)
            @wait_total_hour_min = &f_wait_time($jobid);
            $wait_time_hour = $wait_total_hour_min[1];
            $wait_time_min = $wait_total_hour_min[2];
            # run limit
            @total_hour_min = &f_run_limit($jobid);
            $run_limit_in_sec = $total_hour_min[0];
            $run_limit_hour = $total_hour_min[1];
            $run_limit_min = $total_hour_min[2];
            # hog factor (100 * run_time / total_time with run_time = 0)
            $hog_factor = 0;
        }
        # format the values common to every status with sprintf
        $p_jobid = sprintf("%5s",$jobid);
        $p_user = sprintf("%-8s",$user);
        $p_stat = sprintf("%-5s",$stat);
        $p_queue = sprintf("%-10s",$queue);
        $p_nb_proc = sprintf("%2s",$nb_proc);
        $p_mem = sprintf("%5.0f",$mem);
        $p_swap = sprintf("%5.0f",$swap);
        $p_run_limit = sprintf("%3s:%02d",$run_limit_hour,$run_limit_min);
        $p_hog_factor = sprintf("%5.1f%%",$hog_factor);
        if ($stat ne 'PEND')
        {
            if ($cpu_time_hour < 1000) {
                $p_cpu_time = sprintf("%3s:%02d",$cpu_time_hour,$cpu_time_min);
            } else
            {
                # four digits of hours: keep the column width by showing only
                # the tens of minutes
                $cpu_time_min = $cpu_time_min / 10;
                $p_cpu_time = sprintf("%4s:%01d",$cpu_time_hour,$cpu_time_min);
            }
            $p_run_time = sprintf("%3s:%02d",$run_time_hour,$run_time_min);
            $p_exec_host = sprintf("%-8s",$exec_host);
            $p_efficasity = sprintf("%5.1f%%",$efficasity);
        } else
        {
            $p_cpu_time = sprintf("%3s:%02d",$cpu_time_hour,$cpu_time_min);
            # the RUN/WA column shows the wait time of a pending job
            $p_run_time = sprintf("%3s:%02d",$wait_time_hour,$wait_time_min);
            # no execution host and no efficiency yet: blank columns
            $p_exec_host = ' ' x 8;
            $p_efficasity = ' ' x 6;
        }
        $lignes = join(' ',$p_nb_proc,$p_mem,$p_swap,$p_cpu_time,$p_jobid,$p_user,$p_stat,$p_queue,$p_run_time,$p_run_limit,$p_efficasity,$p_hog_factor,$p_exec_host,$submit_time) . "\n";
        if ($first ne "true")
        {
            $p_lignes = $p_lignes . $lignes;
        } else
        {
            print $lignes;
        }
    } # foreach line of bjobs
    if ($first eq "false")
    {
        # display the jobs in one time
        system("clear");
        print $p_lignes;
    }
    # next display
    $first = "false";
    # print bhosts and lsload
    print "\n";
    system("bhosts");
    print "\n";
    system("lsload");
    print "\n";
    # date: 2000/03/21 10:44:46
    $date=`date '+%Y/%m/%d %H:%M:%S'`;
    chomp($date);
    # user wait for about 50 sec
    $old_job_count = 50;
    # wait time = sleep time + time to run bjobs -l and bhist;
    # with more than 50 running jobs there is nothing left to sleep
    $sleep_time = ($job_count <= 50) ? 50 - $job_count : 0;
    # print the date and the delay before update
    for ($t=0; $t <= $sleep_time ; $t++)
    {
        $p_old_job_count = sprintf("\rbmonitor $VERSION - %s - Update in %ss ",$date,$old_job_count);
        syswrite(STDOUT,"$p_old_job_count",55);
        $old_job_count = $old_job_count - 1;
        sleep(1);
    }
} # while 1
# other possible display
#print "$p_jobid $p_user $p_stat $p_queue $p_nb_proc $p_mem $p_swap $p_cpu_time $p_run_time $p_run_limit $p_efficasity $p_hog_factor $p_exec_host $submit_time\n"
########################################################################################
########################################################################################
## functions for bmonitor ##
########################################################################################
########################################################################################
######################################
# f_efficasity
# CPU efficiency of the current job, in percent:
#   100 * cpu_time / (nb_proc * run_time)
# Reads the globals $nb_proc, $run_time_in_sec and $cpu_time_in_sec set by
# the main loop; the argument passed by the caller is not used.
# Returns the percentage as a string, capped at 999.9. 999.9 is also
# returned when nb_proc * run_time is not positive.
######################################
sub f_efficasity
{
    # globals provided by the main loop (declared so the sub also compiles
    # under "use strict")
    our ($nb_proc, $run_time_in_sec, $cpu_time_in_sec);
    # lexical scratch variables: nothing leaks into the $l_* globals
    my $denominator = $nb_proc * $run_time_in_sec;
    my $efficiency = 999.9;
    if ($denominator > 0)
    {
        $efficiency = 100 * $cpu_time_in_sec / $denominator;
        if ($efficiency > 999.9)
        {
            $efficiency = 999.9;
        }
    }
    return "$efficiency";
}
######################################
# f_mem_swap
# memory and swap of the current job, in megabytes
# Parses the first line of @bjobsl_out (output of bjobs -l $jobid) that
# contains "MEM: "; the values and units are taken by word position after
# splitting the line on spaces.
# Values given in Kbytes or Gbytes are converted; any other unit is
# returned unchanged. Returns ($mem, $swap); a missing value is "0".
# The argument passed by the caller is not used.
######################################
sub f_mem_swap
{
    # global provided by the main loop (declared so the sub also compiles
    # under "use strict")
    our @bjobsl_out;
    # everything is lexical and reset on each call, so the unit of a
    # previous job can never be applied to the current one
    my ($mem, $mem_unit, $swap, $swap_unit);
    foreach my $line (@bjobsl_out)
    {
        next unless $line =~ /MEM: /;
        my @word = split(/ +/, $line);
        ($mem, $mem_unit, $swap, $swap_unit) = @word[2, 3, 5, 6];
        last;
    }
    my @in_mbytes;
    foreach my $pair ([$mem, $mem_unit], [$swap, $swap_unit])
    {
        my ($value, $unit) = @$pair;
        $value = "" unless defined $value;
        $unit = "" unless defined $unit;
        # the unit is followed by ";" or by the end of the line
        $unit =~ s/[;\s]+$//;
        if ($value eq "")
        {
            $value = "0";
        }
        elsif ($unit eq "Kbytes")
        {
            $value = $value / 1024;
        }
        elsif ($unit eq "Gbytes")
        {
            $value = $value * 1024;
        }
        push(@in_mbytes, $value);
    }
    return ($in_mbytes[0], $in_mbytes[1]);
}
#####################################################
# f_cpu_time
# cpu time of the current job
# Takes the 7th space-separated field (index 6) of the first line of
# @bjobsl_out (output of bjobs -l $jobid) that contains "seconds".
# Returns ($cpu_time, $hour, $min):
#   $cpu_time   : total cpu time in seconds ("" when no line matches)
#   $hour, $min : the same duration as whole hours and remaining minutes
# The argument passed by the caller is not used.
#####################################################
sub f_cpu_time
{
    # global provided by the main loop (declared so the sub also compiles
    # under "use strict")
    our @bjobsl_out;
    my $cpu_time = "";
    foreach my $line (@bjobsl_out)
    {
        next unless $line =~ /seconds/;
        my @word = split(/ +/, $line);
        $cpu_time = $word[6];
        last;
    }
    $cpu_time = "" unless defined $cpu_time;
    # conversion in hh:mm (the remaining seconds are dropped)
    my $hour = 0;
    my $min = 0;
    if ($cpu_time >= 60)
    {
        $min = int($cpu_time / 60);
    }
    if ($min >= 60)
    {
        $hour = int($min / 60);
        $min = $min - ($hour * 60);
    }
    return ($cpu_time, $hour, $min);
}
#######################################################
# f_run_time
# run time of the current job
# Uses the first line of @bhist_out (output of bhist $jobid) that starts
# with at least three digits: the first 25 characters (job id, user and
# job name, which may contain spaces) are dropped, and the field at
# index 3 of the remainder split on spaces is the run time.
# Returns ($run_time, $hour, $min):
#   $run_time   : total run time in seconds ("" when no line matches)
#   $hour, $min : the same duration as whole hours and remaining minutes
# The argument passed by the caller is not used.
#######################################################
sub f_run_time
{
    # global provided by the main loop (declared so the sub also compiles
    # under "use strict")
    our @bhist_out;
    my $run_time = "";
    foreach my $line (@bhist_out)
    {
        next unless $line =~ /^[0-9]{3,}/;
        # work on a copy so that @bhist_out is not modified
        my $copy = $line;
        $copy =~ s/.{25}//;
        my @word = split(/ +/, $copy);
        $run_time = $word[3];
        last;
    }
    $run_time = "" unless defined $run_time;
    # conversion in hh:mm (the remaining seconds are dropped)
    my $hour = 0;
    my $min = 0;
    if ($run_time >= 60)
    {
        $min = int($run_time / 60);
    }
    if ($min >= 60)
    {
        $hour = int($min / 60);
        $min = $min - ($hour * 60);
    }
    return ($run_time, $hour, $min);
}
#######################################################
# f_wait_time
# wait time of the current job (before start)
# Uses the first line of @bhist_out (output of bhist $jobid) that starts
# with at least three digits: the first 25 characters (job id, user and
# job name, which may contain spaces) are dropped, and the field at
# index 1 of the remainder split on spaces is the wait time.
# Returns ($wait_time, $hour, $min):
#   $wait_time  : total wait time in seconds (0 when no line matches)
#   $hour, $min : the same duration as whole hours and remaining minutes
# The argument passed by the caller is not used.
#######################################################
sub f_wait_time
{
    # global provided by the main loop (declared so the sub also compiles
    # under "use strict")
    our @bhist_out;
    my $wait_time = 0;
    foreach my $line (@bhist_out)
    {
        next unless $line =~ /^[0-9]{3,}/;
        # work on a copy so that @bhist_out is not modified
        my $copy = $line;
        $copy =~ s/.{25}//;
        my @word = split(/ +/, $copy);
        $wait_time = $word[1];
        last;
    }
    $wait_time = "" unless defined $wait_time;
    # conversion in hh:mm (the remaining seconds are dropped)
    my $hour = 0;
    my $min = 0;
    if ($wait_time >= 60)
    {
        $min = int($wait_time / 60);
    }
    if ($min >= 60)
    {
        $hour = int($min / 60);
        $min = $min - ($hour * 60);
    }
    return ($wait_time, $hour, $min);
}
###########################################################
# f_run_limit
# run limit of the current job
# Uses the first line of @bjobsl_out (output of bjobs -l $jobid) that
# matches "<digit> min of". The limit is the field at index 1 of the line
# split on spaces, or the field at index 5 when that one contains a digit
# (a cpu limit and a run limit printed on the same line).
# users must ask for a run limit (else it's a cpu limit)
# Returns ($run_limit, $hour, $min):
#   $run_limit  : limit in minutes as printed ("" when no line matches)
#   $hour, $min : the same duration as hours and remaining minutes
# The argument passed by the caller is not used.
###########################################################
sub f_run_limit
{
    # global provided by the main loop (declared so the sub also compiles
    # under "use strict")
    our @bjobsl_out;
    my $run_limit = "";
    foreach my $line (@bjobsl_out)
    {
        next unless $line =~ /[0-9] min of/;
        my @word = split(/ +/, $line);
        $run_limit = $word[1];
        # if cpu limit and run limit
        if (defined($word[5]) && $word[5] =~ /[0-9]/)
        {
            $run_limit = $word[5];
        }
        last;
    }
    $run_limit = "" unless defined $run_limit;
    my $hour = 0;
    my $min = 0;
    if ($run_limit >= 60)
    {
        $hour = int($run_limit / 60);
        $min = $run_limit - ($hour * 60);
    } else
    {
        $min = int($run_limit);
    }
    return ($run_limit, $hour, $min);
}
########################################################
# f_nb_proc
# number of processors of the job
# Rebuilds the paragraph of @bjobsl_out (output of bjobs -l $jobid) that
# starts at the line containing "Submitted from host" and ends at the next
# empty line: continuation lines lose their leading whitespace and are
# appended without separator, because the text may be wrapped in the
# middle of a word. The number of processors is the word that precedes
# "Processors" in this paragraph (or that is glued in front of it).
# Returns that word, or 1 when nothing usable is found.
# @bjobsl_out is left untouched. The argument passed by the caller is
# not used.
##########################################################
sub f_nb_proc
{
    # global provided by the main loop (declared so the sub also compiles
    # under "use strict")
    our @bjobsl_out;
    my $paragraph = "";
    my $in_paragraph = 0;
    foreach my $original (@bjobsl_out)
    {
        # work on a copy: the loop variable aliases the array element and
        # the other functions still have to parse the unmodified lines
        my $line = $original;
        # non empty line of the good paragraph
        if ($in_paragraph && $line =~ /[\w\d]/)
        {
            chomp($line);
            $line =~ s/^\s+//;
            $paragraph = $paragraph . $line;
        }
        # empty line: end of the good paragraph
        elsif ($in_paragraph && $line =~ /^$/)
        {
            $in_paragraph = 0;
        }
        # enter in good paragraph
        elsif (!$in_paragraph && $line =~ /Submitted from host/)
        {
            chomp($line);
            $paragraph = $line;
            $in_paragraph = 1;
        }
    }
    # separate the words; done on the joined paragraph so that a
    # "Processors" wrapped over two lines is recognised too
    $paragraph =~ s/,/, /g;
    $paragraph =~ s/Processors/Processors /g;
    my $nb_cpu = "";
    my $prev_word = "";
    foreach my $word (split(/ +/, $paragraph))
    {
        if ($word eq "Processors")
        {
            $nb_cpu = $prev_word;
            last;
        }
        elsif ($word =~ /Processors/)
        {
            # number glued to the keyword, e.g. "4Processors"
            $nb_cpu = $word;
            $nb_cpu =~ s/Processors//;
            last;
        }
        $prev_word = $word;
    }
    if (! $nb_cpu)
    {
        $nb_cpu = 1;
    }
    return $nb_cpu;
}
########################################################
# f_hog_factor
# hog factor = 100 * run time / turnaround time
# turnaround time = PEND + PSUSP + RUN + USUSP + SSUSP
# Uses the first line of @bhist_out (output of bhist $jobid) that starts
# with at least three digits: the first 25 characters (job id, user and
# job name, which may contain spaces) are dropped, then the remainder is
# split on spaces; index 3 is the run time and index 7 the total time.
# A missing run time counts as 0 and a missing total time as 1.
# Returns the percentage, capped at 999.9; 999.9 is also returned when
# the total time is not positive (for instance when no line matches).
# The argument passed by the caller is not used.
########################################################
sub f_hog_factor
{
    # global provided by the main loop (declared so the sub also compiles
    # under "use strict")
    our @bhist_out;
    my $run_time = 0;
    my $total_time = 0;
    foreach my $line (@bhist_out)
    {
        next unless $line =~ /^[0-9]{3,}/;
        # work on a copy so that @bhist_out is not modified
        my $copy = $line;
        $copy =~ s/.{25}//;
        my @word = split(/ +/, $copy);
        $run_time = $word[3];
        if (!defined($run_time) || $run_time eq "") { $run_time = 0; }
        $total_time = $word[7];
        if (!defined($total_time) || $total_time eq "") { $total_time = 1; }
        last;
    }
    my $hog_factor = 999.9;
    if ($total_time > 0)
    {
        $hog_factor = 100 * $run_time / $total_time;
        if ($hog_factor > 999.9)
        {
            $hog_factor = 999.9;
        }
    }
    return $hog_factor;
}
#
#
#########################################################################
__END__
=head1 NAME

bmonitor - display information about LSF jobs and hosts

=head1 DESCRIPTION

bmonitor is a Perl script to monitor LSF jobs.
Every minute the script shows useful information for each job.
This script uses LSF (Load Sharing Facility).

=head1 README

bmonitor is a Perl script to monitor LSF jobs.
Every minute the script shows the following information for each job:

  CPU         : number of processors requested by the user
  MEM         : memory used by the job (MB)
  SWAP        : swap used by the job (MB)
  CPUTIM      : cpu time of the job (hh:mm)
  JOB         : identification number of the job
  USER        : user login
  STAT        : status of the job (PEND, PSUSP, USUSP, SSUSP, RUN)
  QUEUE       : name of the queue
  RUN/WA      : run or wait time: time spent by the job in RUN status for a
                running job, or time spent in PEND status for a pending job (hh:mm)
  RUNLIM      : maximum run time requested by the user (hh:mm)
  EFF         : cpu time / (run time * number of processors)
  HOG         : run time / total time the job has spent in LSF
  EXECHOST    : execution host
  SUBMIT_TIME : date of the submission

The options of bjobs can be used with bmonitor.

This Perl script is available from
http://cch.loria.fr/LSF/bmonitor/
and is distributed under the GNU General Public License.

For more information send a mail to [email protected].

Centre Charles Hermite/LORIA - Nancy - France
http://cch.loria.fr/
http://www.loria.fr/~rossi

=head1 AUTHOR

Christian Rossi <[email protected]>

=pod OSNAMES

Unix

=pod SCRIPT CATEGORIES

UNIX/System_administration

=cut