Difference between revisions of "Monitoring Script"

From PDP/Grid Wiki
Jump to navigationJump to search
Line 2: Line 2:
  
 
   #!/bin/bash
 
   #!/bin/bash
 
 
   wn_list=`pbsnodes -a | grep " wn-" | cut -f9 -d' '`                            #List of WNs
 
   wn_list=`pbsnodes -a | grep " wn-" | cut -f9 -d' '`                            #List of WNs
 
   FILE="/tmp/monitoring_script/results/Monitoring_Results_`date +%k%M%d%m%y`"    #Results file, sent by email
 
   FILE="/tmp/monitoring_script/results/Monitoring_Results_`date +%k%M%d%m%y`"    #Results file, sent by email

Revision as of 16:14, 3 December 2009

The following script checks all the worker nodes (WNs), looking for jobs that are no longer running. So far the jobs are not deleted, but an email is sent as an alert. Once a job id is known, the script gathers extra information about the job via 'tracejob':

 #!/bin/bash
 #
 # Monitoring script: inspects every WN listed by pbsnodes, looks in the
 # `momctl -d2` diagnostics for jobs whose session id list is empty
 # ("sidlist= "), and appends a report to $FILE. For each such job the
 # `tracejob` output is checked for SIGKILL; only when it is found is the job
 # cleared from the WN with `momctl -c`, otherwise the tracejob output is
 # written to the report for manual inspection.
 #
 # Globals written: wn_list, FILE, wn_result, jobidlist, job_id_list, i,
 #                  jobresult (names kept so that code following this section
 #                  can keep using them).
 # NOTE(review): the original comment says $FILE is "sent by email"; the
 # mailing step is not part of this section.

 #######################################
 # Collapse every run of blanks, tabs and newlines in $1 into one space and
 # print the result as a single line.
 # This mirrors what an unquoted `echo $var` does, but the text is never
 # subject to pathname expansion, so tokens such as "job[123.host]" cannot be
 # replaced by the names of matching files in the current directory.
 # Arguments: $1 - text to flatten
 # Outputs:   the flattened text followed by a newline
 #######################################
 flatten_ws() {
     local IFS=$' \t\n'
     local -a words=()
     # read returns non-zero at end of input; the array is filled regardless.
     read -r -d '' -a words <<<"$1" || true
     printf '%s\n' "${words[*]}"
 }

 #######################################
 # List the jobs with an empty sidlist found in flattened momctl output.
 # The text is split at every '[' so that each "job[<id>.<server>] ..." entry
 # starts a new line with its id; lines containing "sidlist= " are kept and
 # cut down to the part before the first dot.
 # tr is used rather than sed 's/\[/\n/g' because "\n" in a sed replacement
 # is not portable.
 # Arguments: $1 - momctl output already flattened with flatten_ws
 # Outputs:   one job id (text before the first '.') per line
 #######################################
 stale_job_ids() {
     printf '%s\n' "$1" | tr '[' '\n' | grep "sidlist= " | cut -f1 -d'.'
 }

 #######################################
 # Tell whether tracejob output mentions SIGKILL.
 # The previous `... | awk '/SIGKILL/' > /dev/null` test was always true,
 # because awk exits 0 whether or not the pattern matched.
 # Arguments: $1 - tracejob output
 # Returns:   0 if "SIGKILL" occurs in $1, 1 otherwise
 #######################################
 was_sigkilled() {
     [[ "$1" == *SIGKILL* ]]
 }

 # List of WNs
 wn_list=$(pbsnodes -a | grep " wn-" | cut -f9 -d' ')
 # Results file, sent by email. %H (zero-padded hour) is used instead of %k
 # (blank-padded hour) so the name never contains a space before 10:00.
 FILE="/tmp/monitoring_script/results/Monitoring_Results_$(date +%H%M%d%m%y)"
 # Diagnosis of a WN (raw momctl output)
 wn_result=""
 # Flattened diagnosis of a WN that has jobs to delete, empty otherwise
 jobidlist=""
 # Counter of WNs with jobs to delete
 i=0
 # Result of tracejob
 jobresult=""

 # Without the results directory every append below would fail and the whole
 # report would be lost.
 mkdir -p -- "$(dirname -- "$FILE")"

 echo "===============================================================================" >> "$FILE"
 echo "=========================== AFFECTED WN DIAGNOSIS =============================" >> "$FILE"
 echo "===============================================================================" >> "$FILE"
 # Exploring the WN list; $wn_list is deliberately unquoted so that it is
 # split into one host name per word.
 for wn in $wn_list; do
     wn_result=$(momctl -d2 -h "$wn")
     # The output must be flattened before grepping: an empty sidlist is
     # printed as "sidlist=" at the end of a line and only becomes "sidlist= "
     # once the lines are joined with spaces.
     jobidlist=$(flatten_ws "$wn_result" | grep "sidlist= ")
     # Is there any job to be deleted?
     if [[ -n "$jobidlist" ]]; then
         printf '%s\n' "$jobidlist" >> "$FILE"
         echo "_______________________________________________________________________________" >> "$FILE"
         job_id_list=$(stale_job_ids "$jobidlist")
         i=$((i + 1))
         # Exploring the jobs (deliberate word splitting, one id per word)
         for job_id in $job_id_list; do
             echo "$job_id"
             jobresult=$(tracejob -n 8 -q "$job_id")
             # A SIGKILL signal was sent
             if was_sigkilled "$jobresult"; then
                 job_id="$job_id.stro.nikhef.nl"
                 echo "The job $job_id can be deleted" >> "$FILE"
                 # Deleting the job...
                 momctl -h "$wn" -c "$job_id"
                 echo "The job $job_id has been deleted from the queue and the WN $wn" >> "$FILE"
             else
                 # This could give another reason to delete the job
                 flatten_ws "$jobresult" >> "$FILE"
             fi
             echo "_______________________________________________________________________________" >> "$FILE"
         done
         echo "_______________________________________________________________________________" >> "$FILE"
         echo "_______________________________________________________________________________" >> "$FILE"
     fi
 done
 if [[ "$i" -eq 0 ]]; then
     echo "There are no jobs to delete" >> "$FILE"
 fi