Difference between revisions of "Monitoring Script"
From PDP/Grid Wiki
Jump to navigationJump to searchm |
m |
||
| (4 intermediate revisions by the same user not shown) | |||
| Line 1: | Line 1: | ||
| − | The following script checks all the WNs, in order to look for jobs that are not running anymore. | + | The following script checks all the WNs, in order to look for jobs that are not running anymore. |
| − | FILE=" | + | So far only those jobs that have been sent a SIGKILL are deleted; and an email is still sent. If the result of a tracejob is showed in the email, then there could be another reason to mark a job as eligible to be deleted: |
| − | + | ||
| − | + | #!/bin/bash | |
| − | i=0 | + | wn_list=`pbsnodes -a | grep " wn-" | cut -f9 -d' '` #List of WNs |
| + | FILE="/tmp/Monitoring_Results_`date +%k%M%d%m%y`" #Results file, sent by email | ||
| + | wn_result="" #Diagnosis of a WN | ||
| + | jobidlist="" #List of jobs to delete | ||
| + | i=0 #Counter of WN with jobs to delete | ||
| + | jobresult="" #Result of tracejob | ||
| + | |||
echo "===============================================================================" >> "$FILE" | echo "===============================================================================" >> "$FILE" | ||
echo "=========================== AFFECTED WN DIAGNOSIS =============================" >> "$FILE" | echo "=========================== AFFECTED WN DIAGNOSIS =============================" >> "$FILE" | ||
echo "===============================================================================" >> "$FILE" | echo "===============================================================================" >> "$FILE" | ||
| − | for wn in ${wn_list[@]} | + | for wn in ${wn_list[@]} #Exploring the wn list |
| − | |||
| − | |||
| − | |||
| − | |||
| − | |||
| − | |||
| − | |||
| − | |||
| − | |||
| − | |||
| − | |||
| − | |||
| − | |||
| − | |||
| − | |||
| − | |||
do | do | ||
| − | + | wn_result=`momctl -d2 -h $wn` | |
| − | + | jobidlist=`echo $wn_result | grep "sidlist= " ` | |
| − | + | if [ ${#jobidlist} -ne 0 ] #Is there any job to be deleted? | |
| − | + | then | |
| − | + | echo $wn_result >> "$FILE" | |
| − | + | echo "_______________________________________________________________________________" >> "$FILE" | |
| − | + | job_id_list=`echo $jobidlist | sed -e 's/\[/\n/g' | grep "sidlist= " | cut -f1 -d'.'` | |
| + | i=`expr $i + 1` | ||
| + | for job_id in ${job_id_list[@]} #Exploring the jobs | ||
| + | do | ||
| + | echo $job_id | ||
| + | jobresult=`tracejob -n 8 -q $job_id` | ||
| + | if echo ${jobresult} | grep "SIGKILL" > /dev/null #A SIGKILL signal was sent | ||
| + | then | ||
| + | job_id="$job_id.stro.nikhef.nl" | ||
| + | echo "The job $job_id can be deleted" >> "$FILE" | ||
| + | momctl -h $wn -c $job_id #Deleting the job... | ||
| + | echo "The job $job_id has been deleted from the queue and the WN $wn" >> "$FILE" | ||
| + | else | ||
| + | echo $jobresult >> "$FILE" #This could give another reason to delete the job | ||
| + | fi | ||
| + | echo "_______________________________________________________________________________" >> "$FILE" | ||
| + | done | ||
| + | echo "_______________________________________________________________________________" >> "$FILE" | ||
| + | echo "_______________________________________________________________________________" >> "$FILE" | ||
| + | fi | ||
done | done | ||
| − | mail -s "Monitoring Results" | + | if [ $i -eq 0 ] |
| + | then | ||
| + | echo "There are no jobs to delete" >> "$FILE" | ||
| + | fi | ||
| + | mail -s "Monitoring Results" <EMAIL_ADDRESS> < "$FILE" #<EMAIL_ADDRESS> has to be edited | ||
| + | rm $FILE | ||
Latest revision as of 14:08, 25 January 2010
The following script checks all the worker nodes (WNs) for jobs that are not running anymore.
So far, only jobs that have been sent a SIGKILL are deleted; an email report is sent in every case. If the email shows the output of tracejob for a job, that job was not deleted, and the output may reveal another reason to mark it as eligible for deletion:
#!/bin/bash
#
# Worker-node (WN) monitoring script.
#
# For every WN selected from `pbsnodes -a`, the `momctl -d2` diagnostics are
# searched for jobs reported with an empty "sidlist= " field. Each such job is
# looked up with tracejob: if the trace mentions SIGKILL, the job is cleared
# from the WN with `momctl -c`; otherwise the trace is added to the report so
# that a person can decide. The report is always sent by email to MAIL_TO.
#
# Usage: edit MAIL_TO below (or export MAIL_TO) and run without arguments.
#
# `set -e` / `pipefail` are deliberately not enabled: a grep that finds
# nothing (exit status 1) is a normal outcome in this script.

# Report recipient. The placeholder has to be edited (or MAIL_TO exported).
MAIL_TO="${MAIL_TO:-<EMAIL_ADDRESS>}"
# Suffix appended to the short job id to build the full job name.
readonly JOB_DOMAIN=".stro.nikhef.nl"
readonly SEPARATOR="_______________________________________________________________________________"
# Path of the report file; set by main, removed by cleanup on any exit path.
REPORT_FILE=""

# Removes the report file, if one was created.
cleanup() {
  if [[ -n "$REPORT_FILE" ]]; then
    rm -f -- "$REPORT_FILE"
  fi
}

# Prints $1 on a single line with every run of whitespace (newlines included)
# collapsed to one space. Unlike an unquoted `echo $var`, no filename
# expansion is applied, so text such as "job[123.host]" is never treated as a
# glob pattern.
flatten() {
  local IFS=$' \t\n'
  local -a words=()
  # read returns non-zero when it hits end of input; the array is still filled.
  read -r -d '' -a words <<<"$1" || true
  printf '%s\n' "${words[*]}"
}

# Handles one candidate job: clears it from the WN when its trace contains
# SIGKILL, otherwise appends the trace to the report.
# Arguments: $1 - worker node, $2 - short job id (text before the first '.')
process_job() {
  local wn=$1 job_id=$2
  local trace full_id

  printf '%s\n' "$job_id" # progress output on stdout
  trace=$(tracejob -n 8 -q "$job_id")
  if [[ "$trace" == *SIGKILL* ]]; then # a SIGKILL signal was sent
    full_id="${job_id}${JOB_DOMAIN}"
    printf '%s\n' "The job $full_id can be deleted" >>"$REPORT_FILE"
    # NOTE(review): assumes momctl exits non-zero when the request fails.
    if momctl -h "$wn" -c "$full_id"; then
      printf '%s\n' "The job $full_id has been deleted from the queue and the WN $wn" >>"$REPORT_FILE"
    else
      printf '%s\n' "WARNING: momctl failed while deleting the job $full_id from the WN $wn" >>"$REPORT_FILE"
    fi
  else
    # The trace could give another reason to delete the job.
    flatten "$trace" >>"$REPORT_FILE"
  fi
}

# Diagnoses one WN and processes every job listed with an empty sidlist.
# Arguments: $1 - worker node
# Returns:   0 if the WN had at least one candidate job, 1 otherwise
check_node() {
  local wn=$1
  local diagnosis job_id
  local -a job_ids=()

  # The diagnostics must be flattened before matching: an empty sidlist is
  # "sidlist=" followed by a line break, which only becomes "sidlist= " once
  # whitespace is collapsed.
  diagnosis=$(flatten "$(momctl -d2 -h "$wn")")
  [[ "$diagnosis" == *"sidlist= "* ]] || return 1

  {
    printf '%s\n' "$diagnosis"
    printf '%s\n' "$SEPARATOR"
  } >>"$REPORT_FILE"

  # Split the flattened text at every '[', keep the chunks that contain an
  # empty sidlist, and take the text before the first '.' as the job id.
  read -r -d '' -a job_ids < <(
    printf '%s\n' "$diagnosis" \
      | sed -e 's/\[/\n/g' \
      | grep "sidlist= " \
      | cut -f1 -d'.'
  ) || true

  for job_id in "${job_ids[@]}"; do
    process_job "$wn" "$job_id"
    printf '%s\n' "$SEPARATOR" >>"$REPORT_FILE"
  done
  printf '%s\n%s\n' "$SEPARATOR" "$SEPARATOR" >>"$REPORT_FILE"
  return 0
}

main() {
  local tool wn
  local affected=0 # counter of WNs with jobs to delete
  local -a nodes=()

  # Refuse to touch any job when the report could not be delivered afterwards.
  if [[ "$MAIL_TO" == "<EMAIL_ADDRESS>" ]]; then
    printf '%s\n' "MAIL_TO is still the placeholder; set the report recipient first" >&2
    exit 1
  fi
  # A missing tool would otherwise produce a misleading "no jobs" report.
  for tool in pbsnodes momctl tracejob mail; do
    if ! command -v "$tool" >/dev/null; then
      printf '%s\n' "required command not found: $tool" >&2
      exit 1
    fi
  done

  # Unique, freshly created report file (no spaces in the name, never an old
  # file that gets appended to); removed on every exit path by the trap.
  REPORT_FILE=$(mktemp /tmp/Monitoring_Results_XXXXXX) || {
    printf '%s\n' "cannot create the report file" >&2
    exit 1
  }
  trap cleanup EXIT

  {
    printf '%s\n' "==============================================================================="
    printf '%s\n' "=========================== AFFECTED WN DIAGNOSIS ============================="
    printf '%s\n' "==============================================================================="
  } >>"$REPORT_FILE"

  # List of WNs: whitespace-separated words of the filtered pbsnodes output.
  read -r -d '' -a nodes < <(pbsnodes -a | grep " wn-" | cut -f9 -d' ') || true

  for wn in "${nodes[@]}"; do
    if check_node "$wn"; then
      affected=$((affected + 1))
    fi
  done

  if ((affected == 0)); then
    printf '%s\n' "There are no jobs to delete" >>"$REPORT_FILE"
  fi
  mail -s "Monitoring Results" "$MAIL_TO" <"$REPORT_FILE"
}

main "$@"