From 2e8536997bad422521e936d1dd7441d3d25d47c7 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Mon, 18 Oct 2010 16:54:07 -0400 Subject: [PATCH] Introduce pysawsim in the README and add fit_force_histograms.py & friends. --- README | 13 +- pysawsim/fit_force_histograms.py | 394 +++++++++++++++++++++++++++++++ pysawsim/qwait | 15 ++ pysawsim/run_vel_dep | 97 ++++++++ pysawsim/vel_dep_graph | 23 ++ 5 files changed, 540 insertions(+), 2 deletions(-) create mode 100755 pysawsim/fit_force_histograms.py create mode 100755 pysawsim/qwait create mode 100755 pysawsim/run_vel_dep create mode 100755 pysawsim/vel_dep_graph diff --git a/README b/README index 7f86701..46340b8 100644 --- a/README +++ b/README @@ -1,8 +1,8 @@ Compiling --------- -Sawsim is written in noweb_ (`noweb`, or the transitional `nowebm`, in -Debian-based distributions). Extract Makefile and compile with: +Sawsim is written in noweb_ (`noweb` in Debian-based distributions). +Extract Makefile and compile with: $ notangle -Rmakefile src/sawsim.nw | sed 's/ /\t/' > Makefile $ make @@ -27,6 +27,15 @@ in Debian-based distributions). .. _check: http://check.sourceforge.net/ +Python wrapper +-------------- + +The `sawsim` binary only runs a single pull, but you'll probably want +to run many repetitions to generate enough data for significant +statistical analysis. To facilitate this, we provide the `pysawsim` +module which provides a higher level interface to `sawsim`. + + License ------- diff --git a/pysawsim/fit_force_histograms.py b/pysawsim/fit_force_histograms.py new file mode 100755 index 0000000..09e645c --- /dev/null +++ b/pysawsim/fit_force_histograms.py @@ -0,0 +1,394 @@ +#!/usr/bin/python + +import os # HACK, for getpid() +import os.path +import pickle +import numpy +from scipy.optimize import leastsq +import shutil +from subprocess import Popen, PIPE +import sys + +import matplotlib +matplotlib.use('Agg') # select backend that doesn't require X Windows +import pylab + +FIGURE = pylab.figure() # avoid memory problems. +# pylab keeps internal references to all created figures, so share a +# single instance. +MEM_DEBUG = False + +class CommandError(Exception): + def __init__(self, command, status, stdout, stderr): + strerror = ["Command failed (%d):\n %s\n" % (status, stderr), + "while executing\n %s" % command] + Exception.__init__(self, "\n".join(strerror)) + self.command = command + self.status = status + self.stdout = stdout + self.stderr = stderr + +def invoke(cmd_string, stdin=None, expect=(0,), cwd=None, verbose=True): + """ + expect should be a tuple of allowed exit codes. cwd should be + the directory from which the command will be executed. + """ + if cwd == None: + cwd = "." + if verbose == True: + print >> sys.stderr, "%s$ %s" % (cwd, cmd_string) + try : + if sys.platform != "win32" and False: + q = Popen(cmd_string, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=cwd) + else: + # win32 don't have os.execvp() so have to run command in a shell + q = Popen(cmd_string, stdin=PIPE, stdout=PIPE, stderr=PIPE, + shell=True, cwd=cwd) + except OSError, e : + raise CommandError(args, status=e.args[0], stdout="", stderr=e) + output,error = q.communicate(input=stdin) + status = q.wait() + if verbose == True: + print >> sys.stderr, "%d\n%s%s" % (status, output, error) + if status not in expect: + raise CommandError(args, status, output, error) + return status, output, error + +def rss(): + """ + For debugging memory usage. + + resident set size, the non-swapped physical memory that a task has + used (in kiloBytes). + """ + call = "ps -o rss= -p %d" % os.getpid() + status,stdout,stderr = invoke(call) + return int(stdout) + +class Histogram (object): + def from_data(self, data, bin_edges): + """ + All bins should be of equal width (so we can calculate which + bin a data point belongs to). + + data should be a numpy array. + """ + self.bin_edges = bin_edges + bin_width = self.bin_edges[1] - self.bin_edges[0] + + bin_is = numpy.floor((data - self.bin_edges[0])/bin_width) + self.counts = [] + for i in range(len(self.bin_edges)-1): + self.counts.append(sum(bin_is == i)) + self.total = float(len(data)) # because some data might be outside the bins + self.mean = data.mean() + self.std_dev = data.std() + def from_file(self, h_file): + """ + h_file format: + # comment and blank lines ignored + + ... + + e.g. + + 150e-12 10 + 200e-12 40 + 250e-12 5 + + The unfolding force should mark the left-hand side of the bin, and + all bins should be of equal width (so we know where the last one ends). + """ + self.bin_edges = [] + self.counts = [] + for line in file(h_file, "r").readlines(): + line = line.strip() + if len(line) == 0 or line[0] == "#": + continue # ignore blank lines and comments + force,count = line.split() #"\t") + self.bin_edges.append(float(force)) + self.counts.append(float(count)) + bin_width = self.bin_edges[1] - self.bin_edges[0] + self.bin_edges.append(self.counts[-1]+bin_width) + self.total = float(sum(self.counts)) + self.mean = 0 + for bin,count in zip(self.bin_edges, self.counts): + bin += bin_width / 2.0 + self.mean += bin * count + self.mean /= self.total + variance = 0 + for bin,count in zip(self.bin_edges, self.counts): + bin += bin_width / 2.0 + variance += (bin - self.mean)**2 * count + self.std_dev = numpy.sqrt(variance) + def normalize(self): + self.probabilities = [] + self.total = float(self.total) + for count in self.counts: + self.probabilities.append(count/self.total) + def residual(self, other, type='bin-height', plot=False, title=None): + """ + Each bin is weighted by the probability of unfolding in that + bin for the self histogram. For residual types where there is + a convention, this histogram is treated as the experimental + histogram and the other histogram is treated as the + theoretical one. + """ + if type in ['jensen-shannon', 'chi-squared']: + assert self.bin_edges == other.bin_edges + if type == 'jensen-shannon': + def d_KL_term(p,q): + """ + Kullback-Leibler divergence for a single bin, with the + exception that we return 0 if q==0, rather than + exploding like d_KL should. We can do this because + the way d_KL_term is used in the Jensen-Shannon + divergence, if q (really m) == 0, then p also == 0, + and the Jensen-Shannon divergence for the entire term + should be zero. + """ + if p==0 or q==0 or p==q: + return 0.0 + return p * numpy.log2(p/q) + residual = 0 + for probA,probB in zip(self.probabilities, other.probabilities): + m = (probA+probB) / 2.0 + residual += 0.5*(d_KL_term(probA,m) + d_KL_term(probB,m)) + elif type == 'chi-squared': + residual = 0 + for probA,probB in zip(self.probabilities, other.probabilities): + residual += (probA-probB)**2 / probB + elif type == 'mean': + residual = (other.mean - self.mean)**2 + elif type == 'std-dev': + residual = (other.std_dev - self.std_dev)**2 + else: + raise NotImplementedError, type + if plot == True: + self.plot_residual_comparison(other, residual, title) + return residual #*self.total # also weight by counts in our histogram + def plot_residual_comparison(self, other, residual=None, title=None): + if not hasattr(self, "_tag_i"): + self._tag_i = 0 + self._tag_i += 1 + + if residual == None: + residual = self.residual(other) + FIGURE.clear() + p = pylab.plot(self.bin_edges[:-1], self.probabilities, 'r-', + other.bin_edges[:-1], other.probabilities, 'b-') + if title != None: + pylab.title(title) + FIGURE.savefig("residual-%g-%d.png" % (residual, self._tag_i)) + + +class HistogramMatcher (object): + def __init__(self, velocity_file, param_format_string, dir_prefix="v_dep"): + self.experiment_histograms = self.read_force_histograms(velocity_file) + self.param_format_string = param_format_string + self.dir_prefix = dir_prefix + def read_force_histograms(self, v_file): + """ + v_file format: + # comment and blank lines ignored + + ... + + e.g. + + 5e-7 histA + 1e-6 histB + """ + histograms = {} + v_file_dir = os.path.dirname(v_file) + for line in file(v_file, "r").readlines(): + line = line.strip() + if len(line) == 0 or line[0] == "#": + continue # ignore blank lines and comments + v,h_file = line.split("\t") + h = Histogram() + h.from_file(os.path.join(v_file_dir, h_file)) + histograms[v] = h + histograms[v].normalize() + return histograms + def dirname(self, params): + if not hasattr(self, "_tag_i"): + self._tag_i = 0 + self._tag_i += 1 + return "%s-%d" % (self.dir_prefix, self._tag_i) + def generate_sawsim_histograms(self, params, histograms, + run_repeat_simulation=True): + velocities = histograms.keys() + dirname = self.dirname(params) + cmd = "run_vel_dep %s %s %s" \ + % (dirname, + ",".join([str(v) for v in velocities]), + self.param_string(params)) + if os.path.exists(dirname) and run_repeat_simulation == True: + print 'repeating simulation' + shutil.rmtree(dirname) + if not os.path.exists(dirname): + invoke(cmd) + sawsim_histograms = {} + for velocity in velocities: + unfolding_forces = self.get_all_unfolding_data(dirname, str(velocity)) + bin_edges = histograms[velocity].bin_edges + h = Histogram() + h.from_data(unfolding_forces, bin_edges) + sawsim_histograms[velocity] = h + sawsim_histograms[velocity].normalize() + return sawsim_histograms + def param_string(self, params): + """ + Generate a string of non-velocity options to pass to sawsim + for a given parameter list. + """ + return self.param_format_string % (params[0], params[1])#tuple(params) + def get_all_unfolding_data(self, dirname, velocity_string): + datafile = os.path.join(dirname, "data_" + velocity_string) + return numpy.fromfile(datafile, sep=" ") + def residual(self, params, type='bin-height', plot=False, + run_repeat_simulation=True): + sawsim_histograms = self.generate_sawsim_histograms( \ + params, self.experiment_histograms, + run_repeat_simulation=run_repeat_simulation) + assert sorted(sawsim_histograms.keys()) == sorted(self.experiment_histograms.keys()) + residual = 0 + for velocity,experiment_histogram in self.experiment_histograms.items(): + sawsim_histogram = sawsim_histograms[velocity] + title = None + if plot == True: + title = ", ".join(["%g" % p for p in params]) + residual += experiment_histogram.residual( \ + sawsim_histogram, type=type, plot=plot, title=title) + print "residual", residual + del(sawsim_histograms) + return residual + def fit(self, initial_params, verbose=False): + p,cov,info,mesg,ier = leastsq(self.residual, initial_params, + full_output=True, maxfev=1000) + if verbose == True: + print "Fitted params:",p + print "Covariance mx:",cov + print "Info:", info + print "mesg:", mesg + if ier == 1: + print "Solution converged" + else: + print "Solution did not converge" + return p + def plot(self, param_ranges, logx=False, logy=False, + residual_type='bin-height', + plot_residuals=False, contour=False, + run_repeat_simulations=True): + xranges = param_ranges[0] + yranges = param_ranges[1] + if logx == False: + x = numpy.linspace(*xranges) + else: + m,M,n = xranges + x = numpy.exp(numpy.linspace(numpy.log(m), numpy.log(M), n)) + if logy == False: + y = numpy.linspace(*yranges) + else: + m,M,n = yranges + y = numpy.exp(numpy.linspace(numpy.log(m), numpy.log(M), n)) + X, Y = pylab.meshgrid(x,y) + C = numpy.zeros((len(y)-1, len(x)-1)) + for i,xi in enumerate(x[:-1]): + for j,yi in enumerate(y[:-1]): + print i, j, i*(len(y)-1) + j, (len(x)-1)*(len(y)-1) + params = (xi,yi) + r = self.residual(params, type=residual_type, + plot=plot_residuals, + run_repeat_simulation=run_repeat_simulations) + C[j,i] = numpy.log(r) # better resolution in valleys + if MEM_DEBUG == True: + print >> sys.stderr, "RSS: %d KB" % rss() + C = numpy.nan_to_num(C) # NaN -> 0 + fid = file("fit_force_histograms-XYC.pkl", "wb") + pickle.dump([X,Y,C], fid) + fid.close() + # read in with + # import pickle + # [X,Y,C] = pickle.load(file("fit_force_histograms-XYC.pkl", "rb")) + # ... + FIGURE.clear() + axes = FIGURE.add_subplot(111) + if logx == True: + axes.set_xscale('log') + if logy == True: + axes.set_yscale('log') + if contour == True: + p = axes.contour(X[:-1,:-1], Y[:-1,:-1], C) + # [:-1,:-1] to strip dummy last row & column from X&Y. + else: # pseudocolor plot + p = axes.pcolor(X, Y, C) + axes.autoscale_view(tight=True) + FIGURE.colorbar(p) + FIGURE.savefig("figure.png") + +def parse_param_ranges_string(string): + """ + '[Amin,Amax,Asteps],[Bmin,Bmax,Bsteps],...' + -> + [[Amin,Amax,Asteps],[Bmin,Bmax,Bsteps],...] + """ + ranges = [] + for range_string in string.split("],["): + range_number_strings = range_string.strip("[]").split(",") + ranges.append([float(x) for x in range_number_strings]) + return ranges + +if __name__ == "__main__": + import optparse + usage = "%prog [options] velocity_file" + + parser = optparse.OptionParser(usage) + parser.add_option("-f","--param-format", dest="param_format", + metavar="FORMAT", + help="Convert params to sawsim options (%default).", + default=('-s cantilever,hooke,0.05 -N1 -s folded,null -N8 -s "unfolded,wlc,{0.39e-9,28e-9}" -k "folded,unfolded,bell,{%g,%g}" -q folded')) + parser.add_option("-p","--initial-params", dest="initial_params", + metavar="PARAMS", + help="Initial params for fitting (%default).", + default='3.3e-4,0.25e-9') + parser.add_option("-r","--param-range", dest="param_range", + metavar="PARAMS", + help="Param range for plotting (%default).", + default='[1e-5,1e-3,20],[0.1e-9,1e-9,20]') + parser.add_option("--logx", dest="logx", + help="Use a log scale for the x range.", + default=False, action="store_true") + parser.add_option("--logy", dest="logy", + help="Use a log scale for the y range.", + default=False, action="store_true") + parser.add_option("-d","--dir-prefix", dest="dir_prefix", + metavar="STRING", + help="Prefix to sawsim data directories (%default).", + default='v_dep') + parser.add_option("-R","--residual", dest="residual", + metavar="STRING", + help="Residual type (from 'jensen-shannon', 'chi-squared', 'mean', 'std-dev'; default: %default).", + default='jensen-shannon') + parser.add_option("-P","--plot-residuals", dest="plot_residuals", + help="Generate residual difference plots for each point in the plot range.", + default=False, action="store_true") + parser.add_option("-c","--contour-plot", dest="contour_plot", + help="Select contour plot (vs. the default pseudocolor plot).", + default=False, action="store_true") + parser.add_option("-n","--no-repeat-simulations", dest="simulations", + help="Do not run simulations if they already exist (re-analyze a previous run's output).", + default=True, action="store_false") + options,args = parser.parse_args() + initial_params = [float(p) for p in options.initial_params.split(",")] + param_ranges = parse_param_ranges_string(options.param_range) + velocity_file = args[0] + + hm = HistogramMatcher(velocity_file, options.param_format, options.dir_prefix) + #hm.fit(initial_params, verbose=True) + hm.plot(param_ranges, logx=options.logx, logy=options.logy, + residual_type=options.residual, + plot_residuals=options.plot_residuals, + contour=options.contour_plot, + run_repeat_simulations=options.simulations) diff --git a/pysawsim/qwait b/pysawsim/qwait new file mode 100755 index 0000000..6a3427b --- /dev/null +++ b/pysawsim/qwait @@ -0,0 +1,15 @@ +#!/bin/bash +# +# Wait for a job to finish execution + +JOBID="$1" + +while sleep 3; do + STAT=`qstat $JOBID 2>&1 1>/dev/null` + UNKOWN=`echo "$STAT" | grep "Unknown Job Id"` + if [ -n "$UNKOWN" ]; then + exit 0 # job has completed, since qstat doesn't recognize it. + fi +done + +exit 1 diff --git a/pysawsim/run_vel_dep b/pysawsim/run_vel_dep new file mode 100755 index 0000000..cb47846 --- /dev/null +++ b/pysawsim/run_vel_dep @@ -0,0 +1,97 @@ +#!/bin/bash +# +# run_vel_dep +# +# run BN*N simulations each at a series of pulling speeds V +# (non-velocity parameters are copied from the run_vel_dep command +# line) for each speed, save all unfolding data data_$V, and generate +# a histogram of unfolding forces hist_$V also generate a file v_dep +# with the mean unfolding force vs velocity. +# +# usage: run_vel_dep OUTPUT_DIR VS SAWSIM PARAMS... +# e.g. run_vel_dep v_dep-123xs \ +# 1e-9,1e-8,1e-7,1e-6,1e-5 \ +# -s cantilever,hooke,0.05 -N1 \ +# -s folded,null -N8 \ +# -s 'unfolded,wlc,{0.39e-9,28e-9}' \ +# -k 'folded,unfolded,bell,{3.3e-4,0.25e-9}' \ +# -q folded + +if [ $# -lt 3 ]; then + STAMP=`date +"%Y.%m.%d.%H.%M.%S"` + echo 'usage: run_vel_dep OUTPUT_DIR VS SAWSIM PARAMS...' + echo "e.g. run_vel_dep v_dep-$STAMP \\" + echo ' 1e-9,1e-8,1e-7,1e-6,1e-5 \' + echo ' -s cantilever,hooke,0.05 -N1 \' + echo ' -s folded,null -N8 \' + echo ' -s "unfolded,wlc,{0.39e-9,28e-9}" \' + echo ' -k "folded,unfolded,bell,{3.3e-4,0.25e-9}" \' + echo ' -q folded' + exit 1 +fi + +DIR="$1" +shift +Vs=`echo $1 | sed 's/,/ /g'` +shift +# Protect special chars from later shell expansion by quoting and +# escaping and " characters that may have been in $arg already. +Q='"' +EQ='\"' +SAWSIM_PARAMS=`for arg in $@; do echo -n "$Q${arg//$Q/$EQ}$Q "; done` + + +# We can't depend on the entire array in one shot, so +# batch executions into a single array instance +BN=20 # sawsim executions per array job instance +#N=8 # number of arrays +N=20 # number of arrays +SAFTEY_SLEEP=2 # seconds to sleep between spawning array job and depending on it + # otherwise dependencies don't catch + +mkdir "$DIR" || exit 1 +pushd "$DIR" > /dev/null +echo "Date "`date` > v_dep_notes +echo "Run$ sawsim $SAWSIM_PARAMS -v \$V" >> v_dep_notes +echo "for V in $Vs" >> v_dep_notes + +Is=`seq 1 $N` # a list of allowed 'i's, since 'for' more compact than 'while' + +vdone_condition="afterany" +VJOBIDS="" +for V in $Vs +do + # run N sawtooth simulations + idone_condition="afterany" + cmd="cd \$PBS_O_WORKDIR + for i in \`seq 1 $BN\`; do + ~/sawsim/sawsim $SAWSIM_PARAMS -v $V > run_$V-\$PBS_ARRAYID-\$i + done" + JOBID=`echo "$cmd" | qsub -h -t 1-$N -N "sawsim" -o run_$V.o -e run_$V.e` || exit 1 + # "3643.abax.physics.drexel.edu" --> "3643" + ARRAYID=`echo "$JOBID" | sed "s/\\([0-9]*\\)\\([.].*\\)/\\1/g"` + for i in $Is; do idone_condition="$idone_condition:$ARRAYID-$i"; done + + sleep $SAFTEY_SLEEP + # once they're done, compile a list of all unfolding forces for each speed + JOBID=`echo "cd \\\$PBS_O_WORKDIR && cat run_$V-* | grep -v '#' | cut -f1 > data_$V" |\ + qsub -h -W depend="$idone_condition" -N sawComp -o data_$V.o -e data_$V.e` \ + || exit 1 + vdone_condition="$vdone_condition:$JOBID" + VJOBIDS="$VJOBIDS $JOBID" + for i in $Is; do qrls "$ARRAYID-$i"; done +done + +# once we have force lists for each velocity, make histograms and F(V) chart +JOBID=`echo "cd \\\$PBS_O_WORKDIR && $HOME/script/vel_dep_graph $Vs" | \ + qsub -W depend="$vdone_condition" -N sawsimGraph` || exit 1 +# "3643.abax.physics.drexel.edu" --> "3643" +ARRAYID=`echo "$JOBID" | sed "s/\\([0-9]*\\)\\([.].*\\)/\\1/g"` +for job in $VJOBIDS; do qrls $job; done + +popd > /dev/null + +# wait until the final job exits +qwait $ARRAYID + +exit 0 diff --git a/pysawsim/vel_dep_graph b/pysawsim/vel_dep_graph new file mode 100755 index 0000000..191c194 --- /dev/null +++ b/pysawsim/vel_dep_graph @@ -0,0 +1,23 @@ +#!/bin/bash +# +# Once we have force lists for each velocity, make histograms and F(V) chart. + +> v_dep +for V in $* +do + # make histograms + OUT=`python -c "import numpy; a=numpy.fromfile('data_$V', sep=' '); print a.mean(), a.std()"` + AVG=`echo "$OUT" | cut -d\ -f1` + STD=`echo "$OUT" | cut -d\ -f2` + #AVG=`awk 'BEGIN{s=0}{s += $1}END{print s/NR}' data_$V` + #STD=`awk "BEGIN{s=0}{s += (\$1-$AVG)**2}END{print ... s/NR}" data_$V` + echo -e "$V\t$AVG\t$STD" >> v_dep + cat data_$V | stem_leaf -v-1 -b1e-12 | cut -d\ -f1,3 > hist_$V +done + +echo "#set terminal png +#set output 'v_dep.png' +set logscale x +set xlabel 'Pulling speed (um/s)' +set ylabel 'Unfolding force (pN)' +plot 'v_dep' using (\$1*1e6):(\$2*1e12) with points" > v_dep.gp -- 2.26.2