Difference between revisions of "Monitoring Script"
From PDP/Grid Wiki
Jump to navigationJump to searchm |
|||
(5 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
− | The following script checks all the WNs, in order to look for jobs that are not running anymore. | + | The following script checks all the WNs, in order to look for jobs that are not running anymore. |
− | FILE=" | + | So far only those jobs that have been sent a SIGKILL are deleted; and an email is still sent. If the result of a tracejob is showed in the email, then there could be another reason to mark a job as eligible to be deleted: |
− | + | ||
− | + | #!/bin/bash | |
− | i=0 | + | wn_list=`pbsnodes -a | grep " wn-" | cut -f9 -d' '` #List of WNs |
+ | FILE="/tmp/Monitoring_Results_`date +%k%M%d%m%y`" #Results file, sent by email | ||
+ | wn_result="" #Diagnosis of a WN | ||
+ | jobidlist="" #List of jobs to delete | ||
+ | i=0 #Counter of WN with jobs to delete | ||
+ | jobresult="" #Result of tracejob | ||
+ | |||
echo "===============================================================================" >> "$FILE" | echo "===============================================================================" >> "$FILE" | ||
echo "=========================== AFFECTED WN DIAGNOSIS =============================" >> "$FILE" | echo "=========================== AFFECTED WN DIAGNOSIS =============================" >> "$FILE" | ||
echo "===============================================================================" >> "$FILE" | echo "===============================================================================" >> "$FILE" | ||
− | for wn in ${wn_list[@]} | + | for wn in ${wn_list[@]} #Exploring the wn list |
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
− | |||
do | do | ||
− | + | wn_result=`momctl -d2 -h $wn` | |
− | + | jobidlist=`echo $wn_result | grep "sidlist= " ` | |
− | + | if [ ${#jobidlist} -ne 0 ] #Is there any job to be deleted? | |
− | + | then | |
− | + | echo $wn_result >> "$FILE" | |
− | + | echo "_______________________________________________________________________________" >> "$FILE" | |
− | + | job_id_list=`echo $jobidlist | sed -e 's/\[/\n/g' | grep "sidlist= " | cut -f1 -d'.'` | |
+ | i=`expr $i + 1` | ||
+ | for job_id in ${job_id_list[@]} #Exploring the jobs | ||
+ | do | ||
+ | echo $job_id | ||
+ | jobresult=`tracejob -n 8 -q $job_id` | ||
+ | if echo ${jobresult} | grep "SIGKILL" > /dev/null #A SIGKILL signal was sent | ||
+ | then | ||
+ | job_id="$job_id.stro.nikhef.nl" | ||
+ | echo "The job $job_id can be deleted" >> "$FILE" | ||
+ | momctl -h $wn -c $job_id #Deleting the job... | ||
+ | echo "The job $job_id has been deleted from the queue and the WN $wn" >> "$FILE" | ||
+ | else | ||
+ | echo $jobresult >> "$FILE" #This could give another reason to delete the job | ||
+ | fi | ||
+ | echo "_______________________________________________________________________________" >> "$FILE" | ||
+ | done | ||
+ | echo "_______________________________________________________________________________" >> "$FILE" | ||
+ | echo "_______________________________________________________________________________" >> "$FILE" | ||
+ | fi | ||
done | done | ||
− | mail -s "Monitoring Results" | + | if [ $i -eq 0 ] |
+ | then | ||
+ | echo "There are no jobs to delete" >> "$FILE" | ||
+ | fi | ||
+ | mail -s "Monitoring Results" <EMAIL_ADDRESS> < "$FILE" #<EMAIL_ADDRESS> has to be edited | ||
+ | rm $FILE |
Latest revision as of 14:08, 25 January 2010
The following script checks all the worker nodes (WNs) for jobs that are no longer running.
So far, only jobs that have been sent a SIGKILL are deleted; an email is sent in every case. If the email shows the output of tracejob for a job, that output may reveal another reason to mark the job as eligible for deletion:
#!/bin/bash
#
# Monitoring script for stale batch jobs on the worker nodes (WNs).
#
# For every WN returned by `pbsnodes -a`, the output of `momctl -d2` is
# searched for jobs whose `sidlist=` field is empty. Each such job is looked
# up with `tracejob`: when the trace mentions SIGKILL the job is cleared from
# the WN with `momctl -c`; otherwise the trace itself is added to the report,
# since it may give another reason to delete the job. The report is mailed in
# every case and the temporary report file is removed afterwards.
#
# Environment:
#   MAIL_TO  recipient of the report; the default placeholder has to be edited.
#   TMPDIR   directory for the temporary report file (default: /tmp).
#
# Exit status: 0 on success, 1 when a required tool is missing, the report
# file cannot be created, or the mail cannot be sent.

MAIL_TO="${MAIL_TO:-<EMAIL_ADDRESS>}" # <EMAIL_ADDRESS> has to be edited
JOB_DOMAIN="stro.nikhef.nl"           # suffix added to the short id for momctl -c
SEPARATOR="_______________________________________________________________________________"
REPORT_FILE=""                        # results file, sent by email; set in main

# Removes the report file, if one was created. Installed as EXIT trap so the
# file does not linger in the temp directory on any exit path.
cleanup() {
  if [[ -n "$REPORT_FILE" ]]; then
    rm -f -- "$REPORT_FILE"
  fi
}

# Prints the list of WNs, one per line, as selected by the original pipeline.
list_worker_nodes() {
  pbsnodes -a | grep " wn-" | cut -f9 -d' '
}

#######################################
# Extracts the short ids of the jobs whose sidlist is empty.
# Arguments: $1 - text as printed by `momctl -d2 -h <wn>`
# Outputs:   one id per line: the text between "[" and the first "." of every
#            "[...]" entry that is followed by an empty "sidlist=" field
#######################################
extract_stale_job_ids() {
  local nl=$'\n'
  local flat
  # Squeeze all whitespace to single blanks so that an empty field always
  # reads "sidlist= ". The newline added by printf covers a stale job on the
  # very last line, whose trailing newline command substitution has stripped.
  flat=$(printf '%s\n' "$1" | tr -s '[:space:]' ' ')
  # One chunk per "[": the chunk starting with a job id also carries that
  # job's sidlist field, so chunk selection and id extraction stay paired.
  printf '%s\n' "${flat//\[/$nl}" | grep "sidlist= " | cut -f1 -d'.'
}

#######################################
# Traces one job and clears it from the WN when a SIGKILL was sent.
# Globals:   REPORT_FILE (appended to), JOB_DOMAIN, SEPARATOR (read)
# Arguments: $1 - WN name, $2 - short job id
# Outputs:   echoes the short job id to stdout, as the original script did
#######################################
handle_job() {
  local wn=$1 job_id=$2
  local trace full_id

  printf '%s\n' "$job_id"
  trace=$(tracejob -n 8 -q "$job_id")
  if grep -q "SIGKILL" <<<"$trace"; then # a SIGKILL signal was sent
    full_id="${job_id}.${JOB_DOMAIN}"
    printf '%s\n' "The job $full_id can be deleted" >>"$REPORT_FILE"
    # Only report success when momctl really succeeded.
    if momctl -h "$wn" -c "$full_id"; then
      printf '%s\n' "The job $full_id has been deleted from the queue and the WN $wn" >>"$REPORT_FILE"
    else
      printf '%s\n' "The job $full_id could not be deleted from the WN $wn (momctl -c failed)" >>"$REPORT_FILE"
    fi
  else
    # This could give another reason to delete the job.
    printf '%s\n' "$trace" >>"$REPORT_FILE"
  fi
  printf '%s\n' "$SEPARATOR" >>"$REPORT_FILE"
}

#######################################
# Diagnoses one WN and handles every job found with an empty sidlist.
# Globals:   REPORT_FILE (appended to), SEPARATOR (read)
# Arguments: $1 - WN name
# Returns:   0 when the WN had at least one such job, 1 otherwise
#######################################
process_node() {
  local wn=$1
  local wn_result job_id
  local -a job_ids=()

  wn_result=$(momctl -d2 -h "$wn")
  mapfile -t job_ids < <(extract_stale_job_ids "$wn_result")
  if ((${#job_ids[@]} == 0)); then
    return 1
  fi

  # Quoted on purpose: keeps the diagnosis readable (line breaks preserved)
  # and prevents "job[...]" tokens from being expanded as glob patterns.
  printf '%s\n' "$wn_result" >>"$REPORT_FILE"
  printf '%s\n' "$SEPARATOR" >>"$REPORT_FILE"

  for job_id in "${job_ids[@]}"; do
    [[ -n "$job_id" ]] || continue
    handle_job "$wn" "$job_id"
  done

  printf '%s\n' "$SEPARATOR" "$SEPARATOR" >>"$REPORT_FILE"
  return 0
}

#######################################
# Entry point: builds the report for all WNs and mails it.
# Globals:   REPORT_FILE (written), MAIL_TO (read)
# Returns:   0 on success, 1 on a missing tool, mktemp failure or mail failure
#######################################
main() {
  local tool wn
  local affected=0 # counter of WNs with jobs to delete
  local -a nodes=()

  for tool in pbsnodes momctl tracejob mail; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      printf 'monitoring: required command not found: %s\n' "$tool" >&2
      return 1
    fi
  done

  # mktemp instead of a date-based name: the old "%k" hour is space-padded
  # and the name was predictable.
  if ! REPORT_FILE=$(mktemp "${TMPDIR:-/tmp}/Monitoring_Results_XXXXXX"); then
    REPORT_FILE=""
    printf 'monitoring: cannot create the report file\n' >&2
    return 1
  fi
  trap cleanup EXIT

  {
    printf '%s\n' "==============================================================================="
    printf '%s\n' "=========================== AFFECTED WN DIAGNOSIS ============================="
    printf '%s\n' "==============================================================================="
  } >>"$REPORT_FILE"

  # Read the list into an array first so that the tools called inside the
  # loop cannot consume the list from stdin.
  mapfile -t nodes < <(list_worker_nodes)
  for wn in "${nodes[@]}"; do
    [[ -n "$wn" ]] || continue
    if process_node "$wn"; then
      affected=$((affected + 1))
    fi
  done

  if ((affected == 0)); then
    printf '%s\n' "There are no jobs to delete" >>"$REPORT_FILE"
  fi

  if ! mail -s "Monitoring Results" "$MAIL_TO" <"$REPORT_FILE"; then
    printf 'monitoring: sending the report to %s failed\n' "$MAIL_TO" >&2
    return 1
  fi
  return 0
}

main "$@"