#!/bin/bash
#
# This script runs on a worker node (Torque client) and executes a
# configurable action when there are no active jobs. Upon completion,
# it records the action as done and puts the node back online.
# Typical usage is invocation from a cron job.
#
# Usage: when_idle [REQFILE]
#   REQFILE  optional path of the script to run; overrides $REQFILE.
#
# Environment (all optional, defaults below):
#   VERBOSE         >0 prints progress messages
#   LOGDIR          directory holding the processed-requests log
#   PROCREQFILE     log of checksums of request files already executed
#   REQFILE         request script to execute when the node is idle
#   PBSNODES        pbsnodes command
#   WHEN_IDLE_FLAG  text that must appear in the node's note
#   CLEAR_DELAY     seconds to wait before putting the node back online
#
# Exit status:
#   0  the request file was executed (the node is only put back online
#      and the checksum recorded when the request itself succeeded)
#   1  the file given as argument does not exist, the request file could
#      not be checksummed, or no temporary file could be created
#   2  nothing to do (node not offline, flag not set, node busy, no
#      request file, or request already processed)
#   3  pbsnodes failed

# Defaults can be overridden through the environment.
VERBOSE=${VERBOSE:-0}
LOGDIR=${LOGDIR:-/var/adm}
PROCREQFILE=${PROCREQFILE:-$LOGDIR/when_idle_requests.log}
REQFILE=${REQFILE:-/var/tmp/when_idle.conf}
PBSNODES=${PBSNODES:-/usr/bin/pbsnodes}
WHEN_IDLE_FLAG=${WHEN_IDLE_FLAG:-"when_idle"}
CLEAR_DELAY=${CLEAR_DELAY:-60}

# Print a progress message, but only in verbose mode.
verbose() {
  if [ "$VERBOSE" -gt 0 ]; then
    echo "$*"
  fi
}

# Make sure the log directory and the processed-requests log exist.
[ -d "$LOGDIR" ] || mkdir -p "$LOGDIR"
PROCREQDIR=$(dirname "$PROCREQFILE")
[ -d "$PROCREQDIR" ] || mkdir -p "$PROCREQDIR"
[ -f "$PROCREQFILE" ] || touch "$PROCREQFILE"

host=$(hostname -f)

# The optional argument specifies the request script.
if [ $# -gt 0 ]; then
  if [ ! -f "$1" ]; then
    # Report the path that was actually tested, not the default.
    echo "Cannot read file: $1"
    exit 1
  fi
  REQFILE=$1
  shift
fi

# Check that the script should do anything:
# - the node is offline
# - there are no jobs managed by Torque running
# - there is a request file that has not been executed before

# First, get the info about this node.
# PBSNODES is left unquoted on purpose so that an override may carry
# extra options (e.g. "pbsnodes -s server").
# shellcheck disable=SC2086
node_info=$($PBSNODES -a "$host")
if [ $? -ne 0 ]; then
  echo "Failed to retrieve MOM state information for PBS server"
  exit 3
fi

# The node info is matched line by line (quoted), so that a pattern
# cannot span two unrelated attributes of the pbsnodes output.

# Is the node currently offline?
if grep -q -e "state =.*offline" <<<"$node_info"; then
  verbose "Node is offline"
else
  verbose "Node is not offline"
  exit 2
fi

# Is the node offline with the "when_idle" flag?
if grep -q -e "note =.*$WHEN_IDLE_FLAG" <<<"$node_info"; then
  verbose "Node has flag set"
else
  verbose "Node does not have flag set"
  exit 2
fi

# Is the node idle? A "jobs =" attribute means jobs are still assigned.
if grep -q "jobs =" <<<"$node_info"; then
  verbose "Node is not idle"
  exit 2
else
  verbose "Node is idle"
fi

# Without a readable request file there is nothing to execute.
if [ ! -r "$REQFILE" ]; then
  verbose "No readable request file $REQFILE"
  exit 2
fi

# Calculate the checksum to verify that the request has not been
# executed before.
checksum=$(md5sum "$REQFILE" | cut -f1 -d" ")
if [ -z "$checksum" ]; then
  # An empty checksum would match every line of the processed log.
  echo "Cannot compute checksum of file: $REQFILE"
  exit 1
fi

if grep -qF -- "$checksum" "$PROCREQFILE"; then
  verbose "Config file with MD5SUM $checksum already processed"
  exit 2
fi
verbose "Config file with MD5SUM $checksum not yet processed"

# Run the request and add its checksum to the list of processed files.
TMPDIR=/tmp
tmpfile=$(mktemp)
if [ $? -ne 0 ] || [ -z "$tmpfile" ]; then
  echo "Cannot create temporary file"
  exit 1
fi
# Remove the scratch file on every exit path, including interruption.
trap 'rm -f -- "$tmpfile"' EXIT

echo "Node is idle, processing $REQFILE" > "$tmpfile"
if sh "$REQFILE" >> "$tmpfile"; then
  echo "$checksum" >> "$PROCREQFILE"    # remember this request as done
  logger -p user.notice -f "$tmpfile"   # log the action and its output

  # Put the node back online and clear the idle flag.
  sleep "$CLEAR_DELAY"
  logger -p user.notice "Putting node $host back online"
  # shellcheck disable=SC2086
  $PBSNODES -c -N "" "$host"
fi

exit 0