--- /dev/null
+#!/bin/bash
+#
+# Run commands through a PBS queue without having to remember anything. :p
+#
+# This script keeps stdin and stdout open until the PBS job returns, monitoring
+# your email for the appropriate job-completion message from the PBS server.
+# This lets you bump a long-executing task onto a compute node, while keeping
+# the lightweight part of your processing on the master node, for easier
+# connection and dependency checking.
+#
+# Downsides to the current implementation:
+# Blocks on email with a subshell (the piped input while loop) and tail
+# running, so counting the script itself, that's 3 processes on the head node
+# for each PBS job you're waiting for. While that's not a really big deal for
+# a few dozen jobs, it eats up memory fairly quickly for thousands of jobs.
+#
+# Another drawback is that you can only append to your $MAIL file while this is
+# running. I'm using a .procmailrc file to deflect PBS-related mail to a
+# separate mailbox, so I can still edit my system mailbox while this script is
+# running, but that's one more thing you'd have to set up...
+#
+# Yet another drawback is that you can't run qsub from a screen session, but
+# that is just qsub in general. Sigh.
+#
+# The solution to the memory problem is to have a single script handle all
+# the spawning and waiting for multiple commands. Take a look at 'qcmds'
+#
+# usage: qcmd [-w[[H:]M:]S] command [arguments]
+# where -w optionally sets the wall time
+# for example:
+# $ ls .
+# file
+# $ qcmd cp file file2 && ls .
+# file
+# file2
+# $ diff file file2
+# (none)
+# or
+# $ qcmd pwd '&&' echo \$PBS_O_WORKDIR
+# /home/sysadmin/script
+# /home/sysadmin/script
+# and to prove we're running through qsub (2nd hostname is on calling system)
+# $ qcmd hostname && hostname
+# n8
+# n0
+#
+# Warning:
+# This script uses process substitution which is a non-POSIX bash feature.
+#
+#   command line arguments
+#              |
+#              v
+#         addjob() ---(submit job with qsub)---> Job-Queue
+#              |                                   |
+#         +----+                         (job-complete email)
+#         |                                        |
+# while:  v                                        |
+#         check4done()                             v
+#         getnextmail() <---(tail -f $MAIL)--- Mailbox
+
+DEBUG=0  # set to 1 to log progress messages to qcmd.<pid> in the current dir
+# Mailbox to watch.  I have a ~/.procmailrc filter forwarding my PBS mail
+# bodies to this file.  If you don't, comment the next line out so that your
+# system $MAIL is monitored instead.
+MAIL=$HOME/.mailspool/completed
+
+JOB_OUTSTANDING=  # job id of the job we are still waiting for
+MAIL_BODY=""      # where getnextmail() leaves the mail it read
+
+# Choose the debug sink: a per-process log file when debugging, otherwise
+# everything written to $DEBUGFILE is thrown away.
+if [ $DEBUG -ne 1 ]; then
+  DEBUGFILE=/dev/null
+else
+  DEBUGFILE=qcmd.$$
+  : > $DEBUGFILE  # start with an empty log (Warning: clobbers an old file)
+fi
+
+# functions for job spawning
+
+addjob ()
+{
+ CMMD=$*
+ echo running: $CMMD >> $DEBUGFILE
+ SCRIPT="cd \$PBS_O_WORKDIR && source dup_env && $CMMD"
+ # dup_env is in ~/bin even though it's a script
+ # since qsub creates it's own evironment, and moves X -> PBS_O_X
+ # for example, PATH -> PBS_O_PATH.
+ # who knows why it does this...
+ JOBID=`echo $SCRIPT | qsub -mae $WALLTIME_OPTION || exit 1`
+ # -mae : Send mail on abort or execute
+ #JOBNAME=STDIN # the qsub default for scripts piped into qsub
+ echo spawner: started $JOBID >> $DEBUGFILE
+ JOB_OUTSTANDING="$JOBID"
+ echo "add new depend: $JOBID" >> $DEBUGFILE
+ return 0
+}
+
+# functions for the job checking loop
+
+# look for completion message bodies along the lines of:
+# PBS Job Id: 206.n0.abax.physics.drexel.edu
+# Job Name: STDIN
+# ...
+# <-- blank line
+#
+# could also poll on ls.
+# neither 'ls' nor 'tail -f' busy loops seem to take measurable processor time.
+getnextmail() # blocking, fd 3 is tail -f $MAIL output
+{
+ BODY=""
+ DONE=0
+ INJOB=0
+ echo "block on mail" >> $DEBUGFILE
+ while [ $DONE -eq 0 ] && read LINE <&3
+ do
+ if [ "${LINE:0:11}" == "PBS Job Id:" ]
+ then
+ # we're reading an email about a job.
+ #echo "in a job" >> $DEBUGFILE
+ INJOB=1
+ fi
+ if [ $INJOB -eq 1 ]
+ then
+ #echo "getting mail: $LINE" >> $DEBUGFILE
+ if [ "${#LINE}" -eq 0 ]
+ then
+ #echo "matched blank line" # we're leaving the email about our job.
+ #echo "got mail" >> $DEBUGFILE
+ DONE=1
+ break
+ fi
+ BODY=`echo -e "$BODY\n$LINE"`
+ fi
+ done
+ echo "returning mail" >> $DEBUGFILE
+ MAIL_BODY="$BODY"
+ return 0
+}
+
+check4done()
+{
+ JOBID=$1
+ i=0
+ echo -n "were we waiting on $JOBID? " >> $DEBUGFILE
+ if [ "$JOBID" == "$JOB_OUTSTANDING" ]
+ then
+ # Sometimes the email comes in BEFORE STDOUT and STDERR were written
+ # stupid Torque...
+ JOBNUM=`echo "$JOBID" | sed 's/[.].*//'`
+ JOBNAME=STDIN # the qsub default for scripts piped into qsub
+ STDOUT=$JOBNAME.o$JOBNUM
+ STDERR=$JOBNAME.e$JOBNUM
+ while [ ! -e "$STDOUT" ] || [ ! -e "$STDERR" ]
+ do
+ sleep 0
+ done
+ # end stupid Torque hack
+
+ echo "yes" >> $DEBUGFILE
+ return 0 # job complete :)
+ fi
+ echo "no" >> $DEBUGFILE
+ return 1 # not one of our completing jobs
+}
+
+printjoboutput()
+{
+ JOBID=$1
+ shift
+ MAIL_BODY=$*
+
+ JOBNUM=`echo "$JOBID" | sed 's/[.].*//'`
+ JOBNAME=STDIN # the qsub default for scripts piped into qsub
+ STDOUT=$JOBNAME.o$JOBNUM
+ STDERR=$JOBNAME.e$JOBNUM
+
+ cat $STDOUT
+ cat $STDERR >&2
+ rm -f $STDOUT $STDERR
+
+ return 0
+}
+
+
+echo "spawning job" >> $DEBUGFILE
+
+if [ ${1:0:2} == "-w" ]
+then
+ WALLTIME=${1:2}
+ WALLTIME_OPTION="-l walltime=$WALLTIME"
+ shift
+fi
+
+addjob $*
+
+# use process substitution to keep tail running between reads, see 'man bash'
+exec 3< <(tail -f $MAIL --pid $$) # open the MAIL file for reading on fd 3
+# $$ expands to this script's PID
+# --pid $$ sets up tail to die when this script exits, see 'man tail'
+
+echo "loop on outstanding job" >> $DEBUGFILE
+
+# email checking loop, in the forground
+while [ -n "$JOB_OUTSTANDING" ]
+ do
+ getnextmail
+ JOBID=`echo "$MAIL_BODY" | sed -n 's/PBS Job Id: *//p'`
+ if check4done "$JOBID"
+ then
+ printjoboutput $JOBID "$MAIL_BODY"
+ EXIT=`echo "$BODY" | sed -n 's/Exit_status=//p'`
+ unset JOB_OUTSTANDING
+ fi
+done
+
+# the tail dies automatically because of the --pid argument
+# we'll leave the tail file descriptor open just in case
+# tail tries to print something else before it dies.
+
+echo "qcmd complete" >> $DEBUGFILE
+
+exit $EXIT