Difference between revisions of "Monitoring Script"
From PDP/Grid Wiki
Jump to navigationJump to searchm |
|||
| (One intermediate revision by the same user not shown) | |||
| Line 5: | Line 5: | ||
#!/bin/bash | #!/bin/bash | ||
wn_list=`pbsnodes -a | grep " wn-" | cut -f9 -d' '` #List of WNs | wn_list=`pbsnodes -a | grep " wn-" | cut -f9 -d' '` #List of WNs | ||
| − | FILE="/tmp | + | FILE="/tmp/Monitoring_Results_`date +%k%M%d%m%y`" #Results file, sent by email |
wn_result="" #Diagnosis of a WN | wn_result="" #Diagnosis of a WN | ||
jobidlist="" #List of jobs to delete | jobidlist="" #List of jobs to delete | ||
| Line 28: | Line 28: | ||
echo $job_id | echo $job_id | ||
jobresult=`tracejob -n 8 -q $job_id` | jobresult=`tracejob -n 8 -q $job_id` | ||
| − | if echo ${jobresult} | | + | if echo ${jobresult} | grep "SIGKILL" > /dev/null #A SIGKILL signal was sent |
then | then | ||
job_id="$job_id.stro.nikhef.nl" | job_id="$job_id.stro.nikhef.nl" | ||
| Line 47: | Line 47: | ||
echo "There are no jobs to delete" >> "$FILE" | echo "There are no jobs to delete" >> "$FILE" | ||
fi | fi | ||
| − | mail -s "Monitoring Results" <EMAIL_ADDRESS> < "$FILE" #< | + | mail -s "Monitoring Results" <EMAIL_ADDRESS> < "$FILE" #<EMAIL_ADDRESS> has to be edited |
| + | rm $FILE | ||
Latest revision as of 14:08, 25 January 2010
The following script checks all the WNs, looking for jobs that are no longer running.
So far, only jobs that have been sent a SIGKILL are deleted; an email with the results is sent in every case. If the output of tracejob for a job is shown in the email, there may be another reason to mark that job as eligible for deletion:
#!/bin/bash
#
# Checks every WN listed by pbsnodes for jobs that momctl reports with an
# empty "sidlist=" entry. Jobs whose tracejob output mentions SIGKILL are
# cleared with "momctl -c"; for the others the tracejob output is put in the
# report, since it could give another reason to delete the job. The report is
# always sent by email.
#
# Optional environment variables:
#   MAIL_TO            recipient of the report
#   PBS_SERVER_SUFFIX  suffix appended to a job number to build the full job id

MAIL_TO="${MAIL_TO:-<EMAIL_ADDRESS>}" #<EMAIL_ADDRESS> has to be edited
PBS_SERVER_SUFFIX="${PBS_SERVER_SUFFIX:-stro.nikhef.nl}"
readonly BANNER="==============================================================================="
readonly TITLE="=========================== AFFECTED WN DIAGNOSIS ============================="
readonly RULE="_______________________________________________________________________________"
FILE="" #Results file, sent by email (created by main)

# Appends one entry to the results file.
report() {
  printf '%s\n' "$1" >> "$FILE"
}

# Removes the results file; installed as the EXIT trap by main.
cleanup() {
  if [[ -n "$FILE" ]]; then
    rm -f -- "$FILE"
  fi
}

# Prints the host names of the WNs, one per line.
# Returns non-zero when pbsnodes itself fails.
list_worker_nodes() {
  local nodes_dump
  nodes_dump=$(pbsnodes -a) || return 1
  # Field 9 (space separated) of the lines containing " wn-" is the host name.
  printf '%s\n' "$nodes_dump" | grep " wn-" | cut -f9 -d' '
}

#######################################
# Prints the job numbers (text between "[" and the first ".") of the jobs
# whose entry in the given momctl output contains "sidlist= ", i.e. a sidlist
# with no value after it (presumably a job without sessions -- confirm against
# the momctl documentation).
# Arguments: $1 - complete output of "momctl -d2 -h <wn>"
# Outputs:   one job number per line on stdout; nothing when none is found
#######################################
find_stale_job_ids() {
  local -a words=()
  local flat
  # Join the output into one line with single spaces, so that a "sidlist="
  # at the end of a line becomes "sidlist= ". read -a splits like an unquoted
  # expansion would, but without pathname expansion: "job[123.server]" is a
  # valid glob and could otherwise be replaced by matching file names.
  read -r -d '' -a words <<< "$1" || true # read reports EOF as non-zero
  flat="${words[*]}"
  [[ "$flat" == *"sidlist= "* ]] || return 0
  # Every "[" starts a new job entry; keep the entries with an empty sidlist.
  printf '%s\n' "$flat" | tr '[' '\n' | grep "sidlist= " | cut -f1 -d'.'
}

# Builds the report for all WNs, deletes the SIGKILLed jobs and mails the
# report. Returns non-zero when the report could not be mailed.
main() {
  local wn_list wn wn_result job_id_list job_id full_job_id jobresult
  local affected_wns=0 #Counter of WN with jobs to delete
  local status=0

  # %H (zero padded) instead of %k (space padded) keeps blanks out of the
  # name; mktemp makes the name unpredictable and creates the file safely.
  FILE=$(mktemp "/tmp/Monitoring_Results_$(date +%H%M%d%m%y)_XXXXXX") || {
    printf 'error: cannot create the results file\n' >&2
    exit 1
  }
  trap cleanup EXIT

  report "$BANNER"
  report "$TITLE"
  report "$BANNER"

  if ! wn_list=$(list_worker_nodes); then
    printf 'warning: pbsnodes failed, no WN could be checked\n' >&2
    report "WARNING: pbsnodes failed, no WN could be checked"
  fi

  # Word splitting is intended here: host names and job numbers contain
  # neither blanks nor glob characters.
  for wn in $wn_list; do #Exploring the wn list
    wn_result=$(momctl -d2 -h "$wn")
    job_id_list=$(find_stale_job_ids "$wn_result")
    [[ -n "$job_id_list" ]] || continue #Is there any job to be deleted?

    report "$wn_result"
    report "$RULE"
    affected_wns=$((affected_wns + 1))

    for job_id in $job_id_list; do #Exploring the jobs
      printf '%s\n' "$job_id"
      jobresult=$(tracejob -n 8 -q "$job_id")
      if [[ "$jobresult" == *SIGKILL* ]]; then #A SIGKILL signal was sent
        full_job_id="$job_id.$PBS_SERVER_SUFFIX"
        report "The job $full_job_id can be deleted"
        # Only claim success when momctl really succeeded.
        if momctl -h "$wn" -c "$full_job_id"; then #Deleting the job...
          report "The job $full_job_id has been deleted from the queue and the WN $wn"
        else
          report "WARNING: the job $full_job_id could not be deleted from the WN $wn"
        fi
      else
        report "$jobresult" #This could give another reason to delete the job
      fi
      report "$RULE"
    done
    report "$RULE"
    report "$RULE"
  done

  if ((affected_wns == 0)); then
    report "There are no jobs to delete"
  fi

  if ! mail -s "Monitoring Results" "$MAIL_TO" < "$FILE"; then
    # The results file is removed on exit, so do not lose its content.
    printf 'warning: the report could not be mailed to %s; it follows:\n' "$MAIL_TO" >&2
    cat -- "$FILE" >&2
    status=1
  fi
  return "$status"
}

main "$@"