From 3b7a503f28b315ddaf28c28e6c1f3653cd490d5c Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Tue, 19 Oct 2010 21:27:12 -0400 Subject: [PATCH] Transition more of the pysawsim framework into Python. --- pysawsim/run_vel_dep | 97 ---------- pysawsim/sawsim_histogram.py | 134 ++++++++++++++ pysawsim/vel_dep_graph | 0 ...stograms.py => velocity_dependant_scan.py} | 173 ++++++++---------- 4 files changed, 214 insertions(+), 190 deletions(-) delete mode 100755 pysawsim/run_vel_dep create mode 100644 pysawsim/sawsim_histogram.py mode change 100755 => 100644 pysawsim/vel_dep_graph rename pysawsim/{fit_force_histograms.py => velocity_dependant_scan.py} (64%) mode change 100755 => 100644 diff --git a/pysawsim/run_vel_dep b/pysawsim/run_vel_dep deleted file mode 100755 index cb47846..0000000 --- a/pysawsim/run_vel_dep +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash -# -# run_vel_dep -# -# run BN*N simulations each at a series of pulling speeds V -# (non-velocity parameters are copied from the run_vel_dep command -# line) for each speed, save all unfolding data data_$V, and generate -# a histogram of unfolding forces hist_$V also generate a file v_dep -# with the mean unfolding force vs velocity. -# -# usage: run_vel_dep OUTPUT_DIR VS SAWSIM PARAMS... -# e.g. run_vel_dep v_dep-123xs \ -# 1e-9,1e-8,1e-7,1e-6,1e-5 \ -# -s cantilever,hooke,0.05 -N1 \ -# -s folded,null -N8 \ -# -s 'unfolded,wlc,{0.39e-9,28e-9}' \ -# -k 'folded,unfolded,bell,{3.3e-4,0.25e-9}' \ -# -q folded - -if [ $# -lt 3 ]; then - STAMP=`date +"%Y.%m.%d.%H.%M.%S"` - echo 'usage: run_vel_dep OUTPUT_DIR VS SAWSIM PARAMS...' - echo "e.g. run_vel_dep v_dep-$STAMP \\" - echo ' 1e-9,1e-8,1e-7,1e-6,1e-5 \' - echo ' -s cantilever,hooke,0.05 -N1 \' - echo ' -s folded,null -N8 \' - echo ' -s "unfolded,wlc,{0.39e-9,28e-9}" \' - echo ' -k "folded,unfolded,bell,{3.3e-4,0.25e-9}" \' - echo ' -q folded' - exit 1 -fi - -DIR="$1" -shift -Vs=`echo $1 | sed 's/,/ /g'` -shift -# Protect special chars from later shell expansion by quoting and -# escaping and " characters that may have been in $arg already. -Q='"' -EQ='\"' -SAWSIM_PARAMS=`for arg in $@; do echo -n "$Q${arg//$Q/$EQ}$Q "; done` - - -# We can't depend on the entire array in one shot, so -# batch executions into a single array instance -BN=20 # sawsim executions per array job instance -#N=8 # number of arrays -N=20 # number of arrays -SAFTEY_SLEEP=2 # seconds to sleep between spawning array job and depending on it - # otherwise dependencies don't catch - -mkdir "$DIR" || exit 1 -pushd "$DIR" > /dev/null -echo "Date "`date` > v_dep_notes -echo "Run$ sawsim $SAWSIM_PARAMS -v \$V" >> v_dep_notes -echo "for V in $Vs" >> v_dep_notes - -Is=`seq 1 $N` # a list of allowed 'i's, since 'for' more compact than 'while' - -vdone_condition="afterany" -VJOBIDS="" -for V in $Vs -do - # run N sawtooth simulations - idone_condition="afterany" - cmd="cd \$PBS_O_WORKDIR - for i in \`seq 1 $BN\`; do - ~/sawsim/sawsim $SAWSIM_PARAMS -v $V > run_$V-\$PBS_ARRAYID-\$i - done" - JOBID=`echo "$cmd" | qsub -h -t 1-$N -N "sawsim" -o run_$V.o -e run_$V.e` || exit 1 - # "3643.abax.physics.drexel.edu" --> "3643" - ARRAYID=`echo "$JOBID" | sed "s/\\([0-9]*\\)\\([.].*\\)/\\1/g"` - for i in $Is; do idone_condition="$idone_condition:$ARRAYID-$i"; done - - sleep $SAFTEY_SLEEP - # once they're done, compile a list of all unfolding forces for each speed - JOBID=`echo "cd \\\$PBS_O_WORKDIR && cat run_$V-* | grep -v '#' | cut -f1 > data_$V" |\ - qsub -h -W depend="$idone_condition" -N sawComp -o data_$V.o -e data_$V.e` \ - || exit 1 - vdone_condition="$vdone_condition:$JOBID" - VJOBIDS="$VJOBIDS $JOBID" - for i in $Is; do qrls "$ARRAYID-$i"; done -done - -# once we have force lists for each velocity, make histograms and F(V) chart -JOBID=`echo "cd \\\$PBS_O_WORKDIR && $HOME/script/vel_dep_graph $Vs" | \ - qsub -W depend="$vdone_condition" -N sawsimGraph` || exit 1 -# "3643.abax.physics.drexel.edu" --> "3643" -ARRAYID=`echo "$JOBID" | sed "s/\\([0-9]*\\)\\([.].*\\)/\\1/g"` -for job in $VJOBIDS; do qrls $job; done - -popd > /dev/null - -# wait until the final job exits -qwait $ARRAYID - -exit 0 diff --git a/pysawsim/sawsim_histogram.py b/pysawsim/sawsim_histogram.py new file mode 100644 index 0000000..848ca90 --- /dev/null +++ b/pysawsim/sawsim_histogram.py @@ -0,0 +1,134 @@ +# Copyright (C) 2009-2010 W. Trevor King +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# The author may be contacted at on the Internet, or +# write to Trevor King, Drudge's University, Physics Dept., 3141 Chestnut St., +# Philadelphia PA 19104, USA. + +from __future__ import with_statement + +import hashlib +import os.path +import shutil + +import numpy + +from . import __version__, log +from .histogram import Histogram +from .manager import InvokeJob +from .sawsim import parse + + +SAWSIM = 'sawsim' # os.path.expand(os.path.join('~', 'bin', 'sawsim')) +CACHE_DIR = os.path.expanduser(os.path.join('~', '.sawsim_histogram')) +DEFAULT_PARAM_STRING = ( + '-s cantilever,hooke,0.05 -N1 ' + '-s folded,null -N8 ' + "-s 'unfolded,wlc,{0.39e-9,28e-9}' " + "-k 'folded,unfolded,bell,{3.3e-4,0.25e-9}' " + '-q folded -v 1e-6') + + +class SawsimHistogram (object): + def __init__(self, use_cache=False, clean_cache=False): + self._use_cache = use_cache + self._clean_cache = clean_cache + + def _cache_dir(self, param_string): + """ + >>> s = SawsimHistogram() + >>> s._cache_dir(DEFAULT_PARAM_STRING) # doctest: +ELLIPSIS + '/.../.sawsim_histogram/...' + """ + return os.path.join( + CACHE_DIR, hashlib.sha256(param_string).hexdigest()) + + def _make_cache(self, param_string): + cache_dir = self._cache_dir(params_string) + os.makedirs(cache_dir) + with open(os.path.join(d, 'param_string'), 'w') as f: + f.write('# version: %s\n%s\n' % (__version__, param_string)) + + def _load_cached_data(self, cache_dir, N): + data = {} + for i in range(N): + simulation_path = os.path.join(d, '%d.dat' % i) + if os.path.exists(simulation_path): + with open(simulation_path, 'r') as f: + data[i] = parse(f.read()) + else: + break + return data + + def _cache_run(self, cache_dir, i, stdout): + simulation_path = os.path.join(d, '%d.dat' % i) + with open(simulation_path, 'w') as f: + f.write(stdout) + + def __call__(self, param_string=None, N=400, bin_edges=None, manager=None): + """Run `N` simulations and return a histogram with `bin_edges`. + + If `bin_edges == None`, return a numpy array of all unfolding + forces. + + Use the `JobManager` instance `manager` for asynchronous job + execution. + + If `_use_cache` is `True`, store an array of unfolding forces + in `CACHE_DIR` for each simulated pull. If the cached forces + are already present for `param_string`, do not redo the + simulation unless `_clean_cache` is `True`. + """ + data = {} + if self._use_cache == True: + d = self._cache_dir(param_string) + if os.path.exists(d): + if self._clean_cache == True: + shutil.rmtree(d) + self._make_cache(param_string) + else: + data = self._load_cached_data(d, N) + log().debug('using %d cached runs for %s' + % (len(data), param_string)) + else: + self._make_cache(param_string) + + jobs = {} + for i in range(N): + if i in data: + continue + jobs[i] = manager.async_invoke(InvokeJob( + '%s %s' % (SAWSIM, param_string))) + complete_jobs = manager.wait([job.id for job in jobs]) + for i,job in jobs.iteritems: + j = complete_jobs[job.id] + assert j.status == 0, j.data + if self._use_cache == True: + self._cache_run(d, i, j.data['stdout']) + data[i] = parse(j.data['stdout']) + del(jobs) + del(complete_jobs) + + # generate histogram + events = [] + for d_i in data.values(): + events.extend([e.force for e in d_i + if e.initial_state == 'folded']) + events = numpy.array(d_i) + if bin_edges == None: + return events + h = Histogram() + h.from_data(events, bin_edges) + return h diff --git a/pysawsim/vel_dep_graph b/pysawsim/vel_dep_graph old mode 100755 new mode 100644 diff --git a/pysawsim/fit_force_histograms.py b/pysawsim/velocity_dependant_scan.py old mode 100755 new mode 100644 similarity index 64% rename from pysawsim/fit_force_histograms.py rename to pysawsim/velocity_dependant_scan.py index 87cc328..f877b44 --- a/pysawsim/fit_force_histograms.py +++ b/pysawsim/velocity_dependant_scan.py @@ -1,4 +1,3 @@ -#!/usr/bin/python # Copyright (C) 2009-2010 W. Trevor King # # This program is free software: you can redistribute it and/or modify @@ -24,12 +23,15 @@ import pickle import shutil import sys - import matplotlib import numpy import pylab from scipy.optimize import leastsq +from . import log +from .histogram import Histogram +from .sawsim_histogram import SawsimHistogram + matplotlib.use('Agg') # select backend that doesn't require X Windows FIGURE = pylab.figure() # avoid memory problems. @@ -44,7 +46,7 @@ def rss(): For debugging memory usage. resident set size, the non-swapped physical memory that a task has - used (in kiloBytes). + used (in kilo-bytes). """ call = "ps -o rss= -p %d" % os.getpid() status,stdout,stderr = invoke(call) @@ -52,16 +54,24 @@ def rss(): class HistogramMatcher (object): - def __init__(self, velocity_file, param_format_string, dir_prefix="v_dep"): - self.experiment_histograms = self.read_force_histograms(velocity_file) + """Compare experimental velocity dependent histograms to simulated data. + + The main entry points are `fit()` and `plot()`. + """ + def __init__(self, velocity_stream, param_format_string, N=400, + residual_type='jensen-shannon', plot=True, use_cache=False): + self.experiment_histograms = self.read_force_histograms(velocity_stream) self.param_format_string = param_format_string - self.dir_prefix = dir_prefix + self.residual_type = residual_type + self.N = N + self.plot = plot + self.sawsim_histogram = SawsimHistogram(use_cache=self.use_cache) - def read_force_histograms(self, v_file): + def read_force_histograms(self, stream): """ v_file format: # comment and blank lines ignored - + ... e.g. @@ -71,36 +81,26 @@ class HistogramMatcher (object): """ histograms = {} v_file_dir = os.path.dirname(v_file) - for line in file(v_file, "r").readlines(): + for line in strem.readlines(): line = line.strip() if len(line) == 0 or line[0] == "#": continue # ignore blank lines and comments - v,h_file = line.split("\t") + v,h_file = line.split() h = Histogram() - h.from_file(os.path.join(v_file_dir, h_file)) + h.from_stream(file(os.path.join(v_file_dir, h_file), 'r')) histograms[v] = h - histograms[v].normalize() return histograms - def dirname(self, params): - if not hasattr(self, "_tag_i"): - self._tag_i = 0 - self._tag_i += 1 - return "%s-%d" % (self.dir_prefix, self._tag_i) - - def generate_sawsim_histograms(self, params, histograms, - run_repeat_simulation=True): - velocities = histograms.keys() - dirname = self.dirname(params) - cmd = "run_vel_dep %s %s %s" \ - % (dirname, - ",".join([str(v) for v in velocities]), - self.param_string(params)) - if os.path.exists(dirname) and run_repeat_simulation == True: - print 'repeating simulation' - shutil.rmtree(dirname) - if not os.path.exists(dirname): - invoke(cmd) + def param_string(self, params, velocity): + """Generate a string of options to pass to `sawsim`. + """ + return '%s -v %g' % ( + self.param_format_string % tuple(params), velocity) + + def get_all_unfolding_data(self, dirname, velocity_string): + datafile = os.path.join(dirname, "data_" + velocity_string) + return numpy.fromfile(datafile, sep=" ") + sawsim_histograms = {} for velocity in velocities: unfolding_forces = self.get_all_unfolding_data(dirname, str(velocity)) @@ -111,58 +111,35 @@ class HistogramMatcher (object): sawsim_histograms[velocity].normalize() return sawsim_histograms - def param_string(self, params): - """ - Generate a string of non-velocity options to pass to sawsim - for a given parameter list. - """ - return self.param_format_string % (params[0], params[1])#tuple(params) - - def get_all_unfolding_data(self, dirname, velocity_string): - datafile = os.path.join(dirname, "data_" + velocity_string) - return numpy.fromfile(datafile, sep=" ") - - def residual(self, params, type='bin-height', plot=False, - run_repeat_simulation=True): - sawsim_histograms = self.generate_sawsim_histograms( \ - params, self.experiment_histograms, - run_repeat_simulation=run_repeat_simulation) - assert sorted(sawsim_histograms.keys()) == sorted(self.experiment_histograms.keys()) + def _residual(self, params): residual = 0 - for velocity,experiment_histogram in self.experiment_histograms.items(): - sawsim_histogram = sawsim_histograms[velocity] - title = None - if plot == True: - title = ", ".join(["%g" % p for p in params]) - r = experiment_histogram.residual(sawsim_histogram, type=type) + for velocity,experiment_hist in self.experiment_histograms.iteritems(): + sawsim_hist = self.sawsim_histogram( + param_string=self.param_string(params, velocity), N=self.N, + bin_edges=experiment_hist.bin_edges) + r = experiment_histogram.residual( + sawsim_hist, type=self.residual_type) residual += r - if plot == True: + if self.plot == True: + title = ", ".join(["%g" % p for p in (params+[velocity])]) filename = "residual-%s-%g.png" % (title.replace(', ','-'), r) - self.plot_residual_comparison( - experiment_histogram, sawsim_histogram, residual=r, + self._plot_residual_comparison( + experiment_hist, sawsim_hist, residual=r, title=title, filename=filename) - print "residual", residual - del(sawsim_histograms) + log().debug('residual: %g' % residual) return residual - def fit(self, initial_params, verbose=False): - p,cov,info,mesg,ier = leastsq(self.residual, initial_params, + def fit(self, initial_params): + p,cov,info,mesg,ier = leastsq(self._residual, initial_params, full_output=True, maxfev=1000) - if verbose == True: - print "Fitted params:",p - print "Covariance mx:",cov - print "Info:", info - print "mesg:", mesg - if ier == 1: - print "Solution converged" - else: - print "Solution did not converge" + _log = log() + _log.info('Fitted params: %s' % p) + _log.info('Covariance mx: %s' % cov) + _log.info('Info: %s' % info) + _log.info('Mesg: %s' % mesg) return p - def plot(self, param_ranges, logx=False, logy=False, - residual_type='bin-height', - plot_residuals=False, contour=False, - run_repeat_simulations=True): + def plot(self, param_ranges, logx=False, logy=False, contour=False): xranges = param_ranges[0] yranges = param_ranges[1] if logx == False: @@ -181,12 +158,10 @@ class HistogramMatcher (object): for j,yi in enumerate(y[:-1]): print i, j, i*(len(y)-1) + j, (len(x)-1)*(len(y)-1) params = (xi,yi) - r = self.residual(params, type=residual_type, - plot=plot_residuals, - run_repeat_simulation=run_repeat_simulations) + r = self._residual(params) C[j,i] = numpy.log(r) # better resolution in valleys if MEM_DEBUG == True: - print >> sys.stderr, "RSS: %d KB" % rss() + log().debug('RSS: %d KB' % rss()) C = numpy.nan_to_num(C) # NaN -> 0 fid = file("fit_force_histograms-XYC.pkl", "wb") pickle.dump([X,Y,C], fid) @@ -210,8 +185,8 @@ class HistogramMatcher (object): FIGURE.colorbar(p) FIGURE.savefig("figure.png") - def plot_residual_comparison(self, expeiment_hist, theory_hist, - residual, title, filename): + def _plot_residual_comparison(self, expeiment_hist, theory_hist, + residual, title, filename): FIGURE.clear() p = pylab.plot(experiment_hist.bin_edges[:-1], experiment_hist.probabilities, 'r-', @@ -222,10 +197,14 @@ class HistogramMatcher (object): def parse_param_ranges_string(string): - """ + """Parse parameter range stings. + '[Amin,Amax,Asteps],[Bmin,Bmax,Bsteps],...' -> [[Amin,Amax,Asteps],[Bmin,Bmax,Bsteps],...] + + >>> parse_param_ranges_string('[1,2,3],[4,5,6]') + [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]] """ ranges = [] for range_string in string.split("],["): @@ -233,6 +212,7 @@ def parse_param_ranges_string(string): ranges.append([float(x) for x in range_number_strings]) return ranges + def main(): import optparse usage = "%prog [options] velocity_file" @@ -250,16 +230,20 @@ def main(): metavar="PARAMS", help="Param range for plotting (%default).", default='[1e-5,1e-3,20],[0.1e-9,1e-9,20]') + parser.add_option("--N", "--number-of-runs", dest="N", + metavar="INT", type='int', + help="Number of sawsim runs at each point in parameter space (%default).", + default=400) parser.add_option("--logx", dest="logx", help="Use a log scale for the x range.", default=False, action="store_true") parser.add_option("--logy", dest="logy", help="Use a log scale for the y range.", default=False, action="store_true") - parser.add_option("-d","--dir-prefix", dest="dir_prefix", + parser.add_option("-d","--cache-dir", dest="cache_dir", metavar="STRING", - help="Prefix to sawsim data directories (%default).", - default='v_dep') + help="Cache directory for sawsim unfolding forces (%default).", + default=sawsim_histogram.CACHE_DIR) parser.add_option("-R","--residual", dest="residual", metavar="STRING", help="Residual type (from 'jensen-shannon', 'chi-squared', 'mean', 'std-dev'; default: %default).", @@ -270,21 +254,24 @@ def main(): parser.add_option("-c","--contour-plot", dest="contour_plot", help="Select contour plot (vs. the default pseudocolor plot).", default=False, action="store_true") - parser.add_option("-n","--no-repeat-simulations", dest="simulations", - help="Do not run simulations if they already exist (re-analyze a previous run's output).", - default=True, action="store_false") + parser.add_option("-C","--use_cache", dest="use_cache", + help="Use cached simulations if they exist (vs. running new simulations) (%default)", + default=False, action="store_true") options,args = parser.parse_args() initial_params = [float(p) for p in options.initial_params.split(",")] param_ranges = parse_param_ranges_string(options.param_range) velocity_file = args[0] - hm = HistogramMatcher(velocity_file, options.param_format, options.dir_prefix) - #hm.fit(initial_params, verbose=True) + sawsim_histogram.CACHE_DIR = options.cache_dir + + hm = HistogramMatcher( + file(velocity_file, 'r'), param_format_string=options.param_format, + N=options.N, residual_type=options.residual, + plot=options.plot_residuals, use_cache=options.use_cache) + #hm.fit(initial_params) hm.plot(param_ranges, logx=options.logx, logy=options.logy, - residual_type=options.residual, - plot_residuals=options.plot_residuals, - contour=options.contour_plot, - run_repeat_simulations=options.simulations) + contour=options.contour_plot) + if __name__ == "__main__": main() -- 2.26.2