#!/bin/bash
#
# Run commands through a PBS queue without having to remember anything. :p
#
# This is an attempt to scale qcmd functionality to multiple commands.
# The goal is to be able to spawn a bunch of PBS jobs from the command line or
# a script, and then wait until they all finish before carrying on.  Basically
# the bash 'wait' functionality for background commands, except for PBS jobs
# spawned by your particular process.
#
# This script doesn't bother with keeping stdin and stdout open for each job,
# since that could be a lot of file descriptors.  It also doesn't keep a
# process going for each job.  Instead, we fire up qcmds in the background
# attached to an input and output fifo.  New job commands get piped into the
# input fifo, and job completion information gets piped out the output fifo.
# When all the jobs that qcmds has started have completed, qcmds closes its
# output fifo.
#
# qcmds uses newlines ('\n') as its job delimiters when reading its input,
# so you can only use single-line jobs.  They can be long lines though ;).
#
# Simple usage would be:
#   echo 'sleep 5 && echo hi there' | qcmds
# Common usage would be:
#   #!/bin/bash
#   i=0
#   QCMDS=$( (while [ $i -lt 1000 ]
#       do
#         echo "sleep 10 && echo $i"
#         let "i += 1"
#       done
#     ) | qcmds)
#   wait # wait for qcmds tail to finish
#   exit 0
#
# Downsides to the current implementation:
# You can only append to your $MAIL file while this is running.  I'm using a
# .procmailrc file to deflect PBS-related mail to a separate mailbox, so I can
# still edit my system mailbox while this script is running, but that's one
# more thing you'd have to set up...
#
# Another drawback is that you can't run qsub from a screen session, but that
# is just qsub in general.  Sigh.
#
# For small numbers of jobs where the scripting overhead of a separate process
# and fifos seems excessive, take a look at the more memory intensive 'qcmd'.
#
# For a nice introduction to bash arrays, see
#   http://tldp.org/LDP/abs/html/arrays.html
#
# see ~/script/.test/t_fifo_readline for a demonstration of the `threading'
#
# Warning:
#   This script uses process substitution which is a non-POSIX bash feature.
#
#     stdin
#       |
#  (list of jobs)
#       |
#       v
#   addjob()  ---(submit job with qsub)--->  Job-Queue
#       |                                        |
#       +----+                         (job-complete email)
#            |                                   |
#   while:   v                                   |
#     check4new()                                v
#     getnextmail() <---(tail -f $MAIL)---    Mailbox

DEBUG=0
MAIL=$HOME/.mailspool/completed
# I have a ~/.procmailrc filter forwarding my PBS mail bodies to this $MAIL
# If you don't, comment the line out so you monitor your system $MAIL.

JOBS_OUTSTANDING=( )   # Sparse array indexed by job number; set => not done yet
NUM_OUTSTANDING=0      # Number of entries currently set in JOBS_OUTSTANDING
MAIL_BODY=""           # Store the return of getnextmail()
OUTPUT_WAIT_SECS=60    # Max seconds check4done() waits for a job's output files

if [ "$DEBUG" -eq 1 ]; then
  DEBUGFILE=qcmds.$$
  : > "$DEBUGFILE"  # clear $DEBUGFILE if it existed before (Warning: clobber)
else
  DEBUGFILE=/dev/null
fi

### functions for the job spawning subshell

# addjob CMD...
# Submit one single-line shell command to PBS via qsub.
# Arguments: the command; all arguments are joined with spaces.
# Outputs:   the job id reported by qsub on stdout (one line).
# Returns:   0 on success, 1 if qsub failed or printed no job id.  On failure
#            nothing is written to stdout, so the checking loop never starts
#            waiting on a job that does not exist.
addjob() {
  local cmd=$*
  local script jobid
  echo "running: $cmd" >> "$DEBUGFILE"
  # dup_env is in ~/bin even though it's a script
  # since qsub creates its own environment, and moves X -> PBS_O_X
  # for example, PATH -> PBS_O_PATH.
  # who knows why it does this...
  script="cd \$PBS_O_WORKDIR && source dup_env && $cmd"
  # -mae : send mail when the job aborts or ends.
  # The script is passed verbatim (quoted, via printf) so globs and
  # backslashes in the command reach the job untouched.
  if ! jobid=$(printf '%s\n' "$script" | qsub -mae) || [ -z "$jobid" ]; then
    echo "qcmds: qsub failed for: $cmd" >&2
    echo "spawner: qsub FAILED for $cmd" >> "$DEBUGFILE"
    return 1
  fi
  echo "spawner: started $jobid" >> "$DEBUGFILE"
  printf '%s\n' "$jobid"
  return 0
}

### functions for the job checking loop

# check4new [TIMEOUT]
# Read one job id from fd 3 (the output of the spawning subshell) and record
# it as outstanding.
# Arguments: TIMEOUT - seconds to wait for a line; defaults to 1.  Pass an
#            empty string to block until a line arrives or the spawner closes
#            its end of the pipe.
# Globals:   JOBS_OUTSTANDING, NUM_OUTSTANDING (written)
# Returns:   0 if a line was consumed, 1 on timeout or end-of-file.
check4new() {
  local timeout=${1-1}
  local jobid jobnum
  if [ -n "$timeout" ]; then
    read -r -t "$timeout" jobid <&3 || return 1  # nothing new.
  else
    read -r jobid <&3 || return 1                # spawner is finished.
  fi
  jobnum=${jobid%%.*}  # "206.n0.example.edu" -> "206"
  case $jobnum in
    '' | *[!0-9]*)
      # Not usable as an array index; drop it but keep draining the pipe.
      echo "ignoring malformed job id: '$jobid'" >> "$DEBUGFILE"
      return 0
      ;;
  esac
  # add job to our outstanding list (count each job number only once)
  if [ -z "${JOBS_OUTSTANDING[$jobnum]}" ]; then
    JOBS_OUTSTANDING[$jobnum]=1
    NUM_OUTSTANDING=$((NUM_OUTSTANDING + 1))
  fi
  # extra space ': $J' to align with addjob DEBUG message
  echo "add new depend:  $jobid" >> "$DEBUGFILE"
  return 0
}

# look for completion message bodies along the lines of:
#   PBS Job Id: 206.n0.abax.physics.drexel.edu
#   Job Name:   STDIN
#   ...
#               <-- blank line
#
# could also poll on ls.

# getnextmail
# Block until one complete PBS message body has been read from fd 4 (the
# output of tail -f $MAIL).  A message starts at a line beginning with
# "PBS Job Id:" and ends at the next blank line; other lines are skipped.
# Globals:   MAIL_BODY (written) - the message lines joined with newlines
# Returns:   0 when a complete message was read, 1 if fd 4 hit end-of-file
#            first (tail exited), in which case MAIL_BODY may be partial.
getnextmail() {
  local nl=$'\n'
  local line body="" injob=0
  echo "block on mail" >> "$DEBUGFILE"
  # Default IFS on purpose: whitespace-only lines count as blank.
  while read -r line <&4; do
    if [ "${line:0:11}" = "PBS Job Id:" ]; then
      injob=1  # we're reading an email about a job.
    fi
    [ "$injob" -eq 1 ] || continue
    if [ -z "$line" ]; then
      # blank line: we're leaving the email about our job.
      echo "returning mail" >> "$DEBUGFILE"
      MAIL_BODY=$body
      return 0
    fi
    body=${body:+$body$nl}$line
  done
  echo "mail stream closed" >> "$DEBUGFILE"
  MAIL_BODY=$body
  return 1
}

# check4done JOBID
# Decide whether JOBID is one of the jobs we are waiting on and, if so, mark
# it complete.
# Globals:   JOBS_OUTSTANDING, NUM_OUTSTANDING (written), OUTPUT_WAIT_SECS
# Returns:   0 if the job was ours (it is removed from the outstanding list),
#            1 otherwise.
check4done() {
  local jobid=$1
  local jobnum=${jobid%%.*}
  local jobname stdout stderr waited=0
  printf '%s' "were we waiting on $jobid? " >> "$DEBUGFILE"
  case $jobnum in
    '' | *[!0-9]*)
      echo "no" >> "$DEBUGFILE"
      return 1
      ;;
  esac
  if [ -z "${JOBS_OUTSTANDING[$jobnum]}" ]; then
    echo "no" >> "$DEBUGFILE"
    return 1  # not one of our completing jobs
  fi
  echo "yes" >> "$DEBUGFILE"
  # Sometimes the email comes in BEFORE STDOUT and STDERR were written
  # stupid Torque...
  jobname=STDIN  # the qsub default for scripts piped into qsub
  stdout=$jobname.o$jobnum
  stderr=$jobname.e$jobnum
  # Poll once a second instead of spinning, and give up eventually: an
  # aborted job may never produce output files at all.
  while [ ! -e "$stdout" ] || [ ! -e "$stderr" ]; do
    if [ "$waited" -ge "$OUTPUT_WAIT_SECS" ]; then
      echo "qcmds: gave up waiting for $stdout and $stderr" >&2
      break
    fi
    sleep 1
    waited=$((waited + 1))
  done
  # end stupid Torque hack
  # remove the outstanding entry from the array (quoted: no globbing).
  unset "JOBS_OUTSTANDING[$jobnum]"
  NUM_OUTSTANDING=$((NUM_OUTSTANDING - 1))
  return 0  # job complete :)
}

# printjobinfo JOBID MAIL_BODY...
# Print one tab-separated summary line for a finished job to stdout (and to
# the debug file):
#   JOBID JOBNUM EXIT STDOUT STDERR HOST SESSION WALLTIME
# EXIT, HOST, SESSION and WALLTIME are extracted from the mail body passed as
# the remaining arguments; fields missing from the mail are left empty.
printjobinfo() {
  local jobid=$1
  shift
  local body=$*
  local jobnum=${jobid%%.*}
  local jobname=STDIN  # the qsub default for scripts piped into qsub
  local stdout=$jobname.o$jobnum
  local stderr=$jobname.e$jobnum
  local host exit_status walltime session summary
  host=$(printf '%s\n' "$body" | sed -n 's/Exec host: *//p')
  exit_status=$(printf '%s\n' "$body" | sed -n 's/Exit_status=//p')
  walltime=$(printf '%s\n' "$body" | sed -n 's/resources_used.walltime=//p')
  session=$(printf '%s\n' "$body" | sed -n 's/session_id=//p')
  summary=$(printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' \
    "$jobid" "$jobnum" "$exit_status" "$stdout" "$stderr" \
    "$host" "$session" "$walltime")
  printf '%s\n' "$summary"
  printf '%s\n' "$summary" >> "$DEBUGFILE"
  return 0
}

# nothing: no-op used as the body of drain loops.
nothing() {
  return 0
}

echo "start the spawning subshell" >> "$DEBUGFILE"

# job-spawning subshell, in a subshell so we can't access its variables
# we send this script's stdin into the subshell as its stdin (< /dev/stdin)
# and we open the subshell's output for reading on file descriptor 3
# with 'exec 3< <(subshell cmmds)' (process substitution, see 'man bash').
exec 3< <(
  echo "spawner: about to read" >> "$DEBUGFILE"
  # IFS= / -r keep the command line exactly as typed; the '|| [ -n ... ]'
  # picks up a final line that has no trailing newline.
  while IFS= read -r LINE || [ -n "$LINE" ]; do
    case $LINE in
      *[![:space:]]*) addjob "$LINE" ;;  # a failed qsub is reported by addjob
      *) ;;                              # skip blank lines
    esac
  done
  exit 0
) < /dev/stdin
SPAWNER_PID=$!  # may be empty on shells that don't set $! here

# use process substitution to keep tail running between reads, see 'man bash'
# open the MAIL file for reading on fd 4
# $$ expands to this script's PID
# --pid $$ sets up tail to die when this script exits, see 'man tail'
exec 4< <(tail --pid="$$" -f -- "$MAIL")

echo "loop on outstanding jobs" >> "$DEBUGFILE"

# email checking loop, in the foreground
while :; do
  # With nothing outstanding, block until the spawner hands us another job
  # id or closes its pipe.  Only end-of-file means that every job has been
  # both submitted and completed, no matter how slow qsub or stdin are.
  while [ "$NUM_OUTSTANDING" -eq 0 ]; do
    check4new "" || break 2
  done
  if ! getnextmail; then
    echo "qcmds: mail stream from $MAIL ended with" \
      "$NUM_OUTSTANDING job(s) outstanding" >&2
    exit 1
  fi
  while check4new; do nothing; done  # clean out the pipe
  JOBID=$(printf '%s\n' "$MAIL_BODY" | sed -n 's/PBS Job Id: *//p')
  if check4done "$JOBID"; then
    printjobinfo "$JOBID" "$MAIL_BODY"
  fi
  echo "still $NUM_OUTSTANDING job(s) outstanding" >> "$DEBUGFILE"
done

echo "cleanup" >> "$DEBUGFILE"
exec 3<&-  # close the connection from the job-spawning subshell
# Reap only the spawner.  A bare 'wait' may also wait for the tail process
# substitution, which by design outlives everything but this script.
if [ -n "$SPAWNER_PID" ]; then
  wait "$SPAWNER_PID" 2> /dev/null
fi
# the tail dies automatically because of the --pid argument
# we'll leave the tail file descriptor open just in case
# tail tries to print something else before it dies.
echo "qcmds complete" >> "$DEBUGFILE"
exit 0