3 # Run commands through a PBS queue without having to remember anything. :p
5 # This is an attempt to scale qcmd functionality to multiple commands.
6 # The goal is to be able to spawn a bunch of PBS jobs from the command line or
7 # a script, and then wait until they all finish before carrying on. Basically
8 # the bash 'wait' functionality for background commands, except for PBS jobs
9 # spawned by your particular process.
11 # This script doesn't bother with keeping stdin and stdout open for each job,
12 # since that could be a lot of file descriptors. It also doesn't keep a process
13 # going for each job. Instead, we fire up qcmds in the background attached to
14 # an input and output fifo. New job commands get piped into the input fifo,
15 # and job completion information gets piped out the output fifo. When all the
16 # jobs that qcmds has started have completed, qcmds closes its output fifo.
18 # qcmds uses endlines ('\n') as its job-delimiters when reading its input,
19 # so you can only use single-line jobs. They can be long lines though ;).
21 # Simple usage would be:
22 # echo 'sleep 5 && echo hi there' | qcmds
23 # Common usage would be:
26 # $QCMDS=$((while [ $i -lt 1000 ]
28 # echo "sleep 10 && echo $i"
32 # wait # wait for qcmds tail to finish
35 # Downsides to the current implementation:
36 # You can only append to your $MAIL file while this is running. I'm using a
37 # .procmailrc file to deflect PBS-related mail to a separate mailbox, so I can
38 # still edit my system mailbox while this script is running, but that's one
39 # more thing you'd have to set up...
41 # Another drawback is that you can't run qsub from a screen session, but that
42 # is just qsub in general. Sigh.
44 # For small numbers of jobs where the scripting overhead of a separate process
45 # and fifos seems excessive, take a look at the more memory intensive 'qcmd'.
47 # For a nice introduction to bash arrays, see
48 # http://tldp.org/LDP/abs/html/arrays.html
50 # see ~/script/.test/t_fifo_readline for a demonstration of the `threading'
53 # This script uses process substitution which is a non-POSIX bash feature.
60 # addjob() ---(submit job with qsub)---> Job-Queue
62 # +----+ (job-complete email)
66 # getnextmail() <---(tail -f $MAIL)--- Mailbox
# Configuration and shared state used by the functions and main loop below.
69 MAIL=$HOME/.mailspool/completed
70 # A ~/.procmailrc filter forwards my PBS mail bodies to this $MAIL file.
71 # If you have no such filter, comment the MAIL= line out so you monitor your system $MAIL.
73 JOBS_OUTSTANDING=( ) # Job ids that haven't completed yet (indexed by numeric job id)
75 MAIL_BODY="" # Store the return of getnextmail()
80 > $DEBUGFILE # truncate $DEBUGFILE if it existed before (Warning: clobber)
# NOTE(review): no assignment to DEBUGFILE is visible in this listing;
# confirm it is set before the redirection above runs.
85 # functions for the job spawning subshell
# Body of the job submitter: wraps $CMMD in a small script, pipes it to qsub,
# logs the resulting job id to $DEBUGFILE and echoes the id on stdout.
# NOTE(review): the function header and the assignment of CMMD are not
# visible in this listing; presumably this is addjob() and CMMD holds one
# single-line job command. Confirm against the complete file.
90 echo running: $CMMD >> $DEBUGFILE
91 SCRIPT="cd \$PBS_O_WORKDIR && source dup_env && $CMMD"
92 # dup_env is in ~/bin even though it's a script,
93 # since qsub creates its own environment and moves X -> PBS_O_X
94 # (for example, PATH -> PBS_O_PATH).
95 # who knows why it does this...
96 JOBID=`echo $SCRIPT | qsub -mae || exit 1`
97 # -mae : send mail when the job aborts (a) or ends (e)
# NOTE(review): the exit 1 above runs inside the command substitution, so it
# only leaves that subshell; execution continues here with an empty JOBID.
# NOTE(review): $SCRIPT is expanded unquoted, so runs of whitespace in the
# command are collapsed and glob characters may be expanded before qsub
# receives it.
98 #JOBNAME=STDIN # the qsub default for scripts piped into qsub
99 echo spawner: started $JOBID >> $DEBUGFILE
100 echo $JOBID # >> $SPAWN_TO_CHECK
104 # functions for the job checking loop
# check4new: try to read one newly started job id from fd 3 and register it.
#   Reads:   fd 3 (one line, into JOBID)
#   Writes:  JOBS_OUTSTANDING[JOBNUM], NUM_OUTSTANDING, $DEBUGFILE
#   Returns: 1 when the read times out or fails (nothing new to register)
# NOTE(review): the braces of the function body are not visible in this listing.
106 check4new() # blocks for at most 1 second, fd 3 is addjob() output
108 read -t1 JOBID <&3 || return 1 # nothing new.
109 # add job to our outstanding list
# JOBNUM is JOBID with everything from the first '.' onwards removed.
110 JOBNUM=`echo "$JOBID" | sed 's/[.].*//'`
111 JOBS_OUTSTANDING[$JOBNUM]=1
112 let "NUM_OUTSTANDING += 1"
113 echo "add new depend: $JOBID" >> $DEBUGFILE
114 # extra space ': $J' to align with addjob DEBUG message
119 # look for completion message bodies along the lines of:
120 # PBS Job Id: 206.n0.abax.physics.drexel.edu
125 # could also poll on ls.
126 # neither 'ls' nor 'tail -f' busy loops seem to take measurable processor time.
# getnextmail: read lines from fd 4 and collect the text of one job mail in BODY.
#   Reads:  fd 4 (one line at a time, into LINE), DONE
#   Writes: BODY, $DEBUGFILE
# NOTE(review): the initialisation of DONE and BODY, the bodies of both if
# statements and the end of the function are not visible in this listing;
# presumably a blank line ends the mail and sets DONE, and the result is
# handed back through MAIL_BODY. Confirm against the complete file.
127 getnextmail() # blocking, fd 4 is tail -f $MAIL output
132 echo "block on mail" >> $DEBUGFILE
# Loop until DONE is non-zero or the read from fd 4 fails.
133 while [ $DONE -eq 0 ] && read LINE <&4
# The first 11 characters of LINE are compared against the mail's job-id header.
135 if [ "${LINE:0:11}" == "PBS Job Id:" ]
137 # we're reading an email about a job.
138 #echo "in a job" >> $DEBUGFILE
143 #echo "getting mail: $LINE" >> $DEBUGFILE
# An empty LINE (length 0) is tested for here.
144 if [ "${#LINE}" -eq 0 ]
146 #echo "matched blank line" # we're leaving the email about our job.
147 #echo "got mail" >> $DEBUGFILE
# Append LINE to BODY on a new line. echo -e also interprets backslash
# escapes that occur inside BODY or LINE.
151 BODY=`echo -e "$BODY\n$LINE"`
154 echo "returning mail" >> $DEBUGFILE
# Body of the completion check: decide whether the job named by JOBID is one
# this process is waiting on.
#   Reads:   JOBID, JOBS_OUTSTANDING
#   Writes:  JOBS_OUTSTANDING, NUM_OUTSTANDING, $DEBUGFILE
#   Returns: 0 when the job was outstanding (it is then removed from the list),
#            1 otherwise
# NOTE(review): the function header, the assignment of JOBID, the then/else/fi
# keywords and the body of the wait loop are not visible in this listing;
# presumably this is check4done() with JOBID taken from its first argument.
# JOBNUM is JOBID with everything from the first '.' onwards removed.
162 JOBNUM=`echo "$JOBID" | sed 's/[.].*//'`
163 echo -n "were we waiting on $JOBID? " >> $DEBUGFILE
164 if [ -n "${JOBS_OUTSTANDING[$JOBNUM]}" ]
166 echo "yes" >> $DEBUGFILE
167 # Sometimes the email comes in BEFORE STDOUT and STDERR were written
169 JOBNAME=STDIN # the qsub default for scripts piped into qsub
170 STDOUT=$JOBNAME.o$JOBNUM
171 STDERR=$JOBNAME.e$JOBNUM
# Loop while either output file is still missing from the current directory.
172 while [ ! -e "$STDOUT" ] || [ ! -e "$STDERR" ]
176 # end stupid Torque hack
177 # remove the outstanding entry from the array.
178 unset JOBS_OUTSTANDING[$JOBNUM]
179 let "NUM_OUTSTANDING -= 1"
180 return 0 # job complete :)
182 echo "no" >> $DEBUGFILE
183 return 1 # not one of our completing jobs
# Body of the job reporter: build one tab-separated record describing a
# finished job and write it to stdout and to $DEBUGFILE.
#   Reads:  JOBID, BODY
#   Output: JOBID, JOBNUM, EXIT, STDOUT, STDERR, HOST, SESSION, WALLTIME
#           separated by tabs, on one line
# NOTE(review): the function header and the assignments of JOBID and BODY are
# not visible in this listing; presumably this is printjobinfo() and they come
# from its first and second arguments.
# JOBNUM is JOBID with everything from the first '.' onwards removed.
192 JOBNUM=`echo "$JOBID" | sed 's/[.].*//'`
193 JOBNAME=STDIN # the qsub default for scripts piped into qsub
# Names of the job's output files: <JOBNAME>.o<JOBNUM> and <JOBNAME>.e<JOBNUM>.
194 STDOUT=$JOBNAME.o$JOBNUM
195 STDERR=$JOBNAME.e$JOBNUM
# Each sed prints only the lines of BODY containing its marker, with the
# marker text removed; the rest of such a line is kept as is.
197 HOST=`echo "$BODY" | sed -n 's/Exec host: *//p'`
198 EXIT=`echo "$BODY" | sed -n 's/Exit_status=//p'`
199 WALLTIME=`echo "$BODY" | sed -n 's/resources_used.walltime=//p'`
200 SESSION=`echo "$BODY" | sed -n 's/session_id=//p'`
# The same record goes to stdout and then to the debug log.
202 echo -e "$JOBID\t$JOBNUM\t$EXIT\t$STDOUT\t$STDERR\t$HOST\t$SESSION\t$WALLTIME"
203 echo -e "$JOBID\t$JOBNUM\t$EXIT\t$STDOUT\t$STDERR\t$HOST\t$SESSION\t$WALLTIME" >> $DEBUGFILE
# Main flow: start the job spawner, follow the mail file, then loop until no
# jobs are outstanding, printing one record per completed job.
# NOTE(review): the spawner subshell itself, the do/done/then/fi keywords and
# the call that fills MAIL_BODY are not visible in this listing.
213 echo "start the spawning subshell" >> $DEBUGFILE
215 # job-spawning subshell, in a subshell so we can't access its variables
216 # we send this script's stdin into the subshell as its stdin (< /dev/stdin)
217 # and we open the subshell's output for reading on file descriptor 3
218 # with 'exec 3< <(subshell cmds)' (process substitution, see 'man bash').
220 echo "spawner: about to read" >> $DEBUGFILE
228 # use process substitution to keep tail running between reads, see 'man bash'
229 exec 4< <(tail -f $MAIL --pid $$) # open the MAIL file for reading on fd 4
230 # $$ expands to this script's PID
231 # --pid $$ sets up tail to die when this script exits, see 'man tail'
233 echo "loop on outstanding jobs" >> $DEBUGFILE
235 # email checking loop, in the foreground
236 check4new # make sure there is an outstanding job first...
237 while [ $NUM_OUTSTANDING -gt 0 ]
# NOTE(review): no definition of a command called nothing is visible in this
# listing; confirm it exists, otherwise each iteration reports an error.
240 while check4new; do nothing; done; # clean out the pipe
# Pull the job id out of the mail text held in MAIL_BODY.
241 JOBID=`echo "$MAIL_BODY" | sed -n 's/PBS Job Id: *//p'`
242 if check4done "$JOBID"
244 printjobinfo $JOBID "$MAIL_BODY"
246 echo "still $NUM_OUTSTANDING job(s) outstanding" >> $DEBUGFILE
251 echo "cleanup" >> $DEBUGFILE
253 exec 3>&- # close the connection from the job-spawning subshell
255 # the tail dies automatically because of the --pid argument
256 # we'll leave the tail file descriptor open just in case
257 # tail tries to print something else before it dies.
259 echo "qcmds complete" >> $DEBUGFILE