Monitoring Script

From PDP/Grid Wiki
Revision as of 16:17, 3 December 2009 by Fbernabe@nikhef.nl (talk | contribs)
Jump to navigation | Jump to search

The following script checks all the worker nodes (WNs), looking for jobs that are no longer running.

So far, only jobs that have been sent a SIGKILL are deleted; an email report is sent in every case. If the email shows the tracejob output for a job, that job was not deleted, and the output may reveal another reason to mark it as eligible for deletion:

 #!/bin/bash
 #
 # Checks every worker node (WN) for jobs that are not running anymore,
 # writes a report file and mails it.
 #
 # A job is treated as stale when its entry in the `momctl -d2` output
 # contains "sidlist= " (presumably an empty session-id list -- confirm
 # against your Torque version). A stale job is cleared from its WN only if
 # its tracejob output mentions SIGKILL; otherwise the tracejob output is
 # copied into the report so that an operator can decide by hand.
 #
 # Requires: pbsnodes, momctl, tracejob, mail.
 # MAIL_TO has to be edited before the report can be mailed.
 #
 MAIL_TO="<EMAIL_ADDRESS>"                       #Recipient of the report (edit me)
 RESULTS_DIR="/tmp/monitoring_script/results"    #Where the reports are kept
 # %H (zero-padded hour) instead of %k (space-padded) keeps blanks out of the name.
 FILE="$RESULTS_DIR/Monitoring_Results_$(date +%H%M%d%m%y)"   #Results file, sent by email
 SERVER_SUFFIX=".stro.nikhef.nl"                 #Appended to a numeric job id
 SEPARATOR="_______________________________________________________________________________"
 #
 # Collapse every run of blanks, tabs and newlines in $1 into one space and
 # print the result on a single line. `read -a` splits on IFS exactly like an
 # unquoted expansion would, but without pathname expansion, so text such as
 # "job[123.x]" can never be replaced by matching file names.
 flatten() {
         local -a words=()
         read -r -d '' -a words <<< "$1"
         printf '%s\n' "${words[*]}"
 }
 #
 # Print, one per line, the numeric id of every job in the momctl diagnosis
 # $1 whose entry contains "sidlist= ". The text is flattened first, so a
 # "sidlist=" that ends its line is followed by a space and matches. It is
 # then cut at every "[" so each piece starts with "<id>.<server>]"; the part
 # before the first "." is the numeric id.
 stale_job_ids() {
         flatten "$1" | tr '[' '\n' | grep "sidlist= " | cut -f1 -d'.'
 }
 #
 # Succeed only if the tracejob output $1 mentions SIGKILL.
 # A shell pattern match is used on purpose: `awk '/SIGKILL/'` exits 0 whether
 # or not the pattern matched, so it cannot serve as a condition.
 trace_shows_sigkill() {
         [[ "$1" == *SIGKILL* ]]
 }
 #
 # Entry point: scan the WNs, clear SIGKILLed stale jobs, write and mail the
 # report. Returns non-zero if the report directory cannot be created or if
 # MAIL_TO has not been configured.
 main() {
         local -a wn_list=() job_ids=()
         local wn wn_result job_id full_id jobresult
         local i=0                               #Counter of WN with jobs to delete
 #
         mkdir -p "$RESULTS_DIR" || {
                 echo "Cannot create $RESULTS_DIR" >&2
                 return 1
         }
 #
         # List of WNs: 9th space-separated field of each pbsnodes line that
         # contains " wn-". The extraction depends on exact field positions,
         # so it is left exactly as it was.
         read -r -d '' -a wn_list < <(pbsnodes -a | grep " wn-" | cut -f9 -d' ')
 #
         {
                 echo "==============================================================================="
                 echo "=========================== AFFECTED WN DIAGNOSIS ============================="
                 echo "==============================================================================="
         } >> "$FILE"
 #
         for wn in "${wn_list[@]}"; do           #Exploring the wn list
                 wn_result=$(momctl -d2 -h "$wn")
                 job_ids=()
                 read -r -d '' -a job_ids < <(stale_job_ids "$wn_result")
                 if (( ${#job_ids[@]} == 0 )); then
                         continue                #Nothing to delete on this WN
                 fi
 #
                 {
                         printf '%s\n' "$wn_result"
                         echo "$SEPARATOR"
                 } >> "$FILE"
                 i=$((i + 1))
 #
                 for job_id in "${job_ids[@]}"; do       #Exploring the jobs
                         echo "$job_id"
                         jobresult=$(tracejob -n 8 -q "$job_id")
                         if trace_shows_sigkill "$jobresult"; then
                                 full_id="${job_id}${SERVER_SUFFIX}"
                                 echo "The job $full_id can be deleted" >> "$FILE"
                                 # Only claim success when momctl reports it.
                                 if momctl -h "$wn" -c "$full_id"; then
                                         echo "The job $full_id has been deleted from the queue and the WN $wn" >> "$FILE"
                                 else
                                         echo "The job $full_id could NOT be deleted from the WN $wn (momctl failed)" >> "$FILE"
                                 fi
                         else
                                 #This could give another reason to delete the job
                                 printf '%s\n' "$jobresult" >> "$FILE"
                         fi
                         echo "$SEPARATOR" >> "$FILE"
                 done
                 {
                         echo "$SEPARATOR"
                         echo "$SEPARATOR"
                 } >> "$FILE"
         done
 #
         if (( i == 0 )); then
                 echo "There are no jobs to delete" >> "$FILE"
         fi
 #
         # Do not mail to the unedited placeholder; keep the report on disk.
         if [[ "$MAIL_TO" == "<EMAIL_ADDRESS>" ]]; then
                 echo "MAIL_TO has not been edited; report kept in $FILE" >&2
                 return 1
         fi
         mail -s "Monitoring Results" "$MAIL_TO" < "$FILE"
 }
 #
 main "$@"