From 3b7a503f28b315ddaf28c28e6c1f3653cd490d5c Mon Sep 17 00:00:00 2001
From: "W. Trevor King" <wking@drexel.edu>
Date: Tue, 19 Oct 2010 21:27:12 -0400
Subject: [PATCH] Transition more of the pysawsim framework into Python.

---
 pysawsim/run_vel_dep                          |  97 ----------
 pysawsim/sawsim_histogram.py                  | 134 ++++++++++++++
 pysawsim/vel_dep_graph                        |   0
 ...stograms.py => velocity_dependant_scan.py} | 173 ++++++++----------
 4 files changed, 214 insertions(+), 190 deletions(-)
 delete mode 100755 pysawsim/run_vel_dep
 create mode 100644 pysawsim/sawsim_histogram.py
 mode change 100755 => 100644 pysawsim/vel_dep_graph
 rename pysawsim/{fit_force_histograms.py => velocity_dependant_scan.py} (64%)
 mode change 100755 => 100644

diff --git a/pysawsim/run_vel_dep b/pysawsim/run_vel_dep
deleted file mode 100755
index cb47846..0000000
--- a/pysawsim/run_vel_dep
+++ /dev/null
@@ -1,97 +0,0 @@
-#!/bin/bash
-#
-# run_vel_dep
-#
-# run BN*N simulations each at a series of pulling speeds V
-# (non-velocity parameters are copied from the run_vel_dep command
-# line) for each speed, save all unfolding data data_$V, and generate
-# a histogram of unfolding forces hist_$V also generate a file v_dep
-# with the mean unfolding force vs velocity.
-#
-# usage: run_vel_dep OUTPUT_DIR VS SAWSIM PARAMS...
-# e.g.   run_vel_dep v_dep-123xs \
-#          1e-9,1e-8,1e-7,1e-6,1e-5 \
-#          -s cantilever,hooke,0.05 -N1 \
-#          -s folded,null -N8 \
-#          -s 'unfolded,wlc,{0.39e-9,28e-9}' \
-#          -k 'folded,unfolded,bell,{3.3e-4,0.25e-9}' \
-#          -q folded
-
-if [ $# -lt 3 ]; then
-  STAMP=`date +"%Y.%m.%d.%H.%M.%S"`
-  echo 'usage: run_vel_dep OUTPUT_DIR VS SAWSIM PARAMS...'
-  echo "e.g.   run_vel_dep v_dep-$STAMP \\"
-  echo '   1e-9,1e-8,1e-7,1e-6,1e-5 \'
-  echo '   -s cantilever,hooke,0.05 -N1 \'
-  echo '   -s folded,null -N8 \'
-  echo '   -s "unfolded,wlc,{0.39e-9,28e-9}" \'
-  echo '   -k "folded,unfolded,bell,{3.3e-4,0.25e-9}" \'
-  echo '   -q folded'
-  exit 1
-fi
-
-DIR="$1"
-shift
-Vs=`echo $1 | sed 's/,/ /g'`
-shift
-# Protect special chars from later shell expansion by quoting and
-# escaping and " characters that may have been in $arg already.
-Q='"'
-EQ='\"'
-SAWSIM_PARAMS=`for arg in $@; do echo -n "$Q${arg//$Q/$EQ}$Q "; done`
-
-
-# We can't depend on the entire array in one shot, so
-# batch executions into a single array instance
-BN=20 # sawsim executions per array job instance
-#N=8 # number of arrays
-N=20 # number of arrays
-SAFTEY_SLEEP=2 # seconds to sleep between spawning array job and depending on it
-               # otherwise dependencies don't catch
-
-mkdir "$DIR" || exit 1
-pushd "$DIR" > /dev/null
-echo "Date "`date` > v_dep_notes
-echo "Run$ sawsim $SAWSIM_PARAMS -v \$V" >> v_dep_notes
-echo "for V in $Vs" >> v_dep_notes
-
-Is=`seq 1 $N` # a list of allowed 'i's, since 'for' more compact than 'while'
-
-vdone_condition="afterany"
-VJOBIDS=""
-for V in $Vs
-do
-  # run N sawtooth simulations
-  idone_condition="afterany"
-  cmd="cd \$PBS_O_WORKDIR
-       for i in \`seq 1 $BN\`; do
-         ~/sawsim/sawsim $SAWSIM_PARAMS -v $V > run_$V-\$PBS_ARRAYID-\$i
-       done"
-  JOBID=`echo "$cmd" | qsub -h -t 1-$N -N "sawsim" -o run_$V.o -e run_$V.e` || exit 1
-  #  "3643.abax.physics.drexel.edu" --> "3643"
-  ARRAYID=`echo "$JOBID" | sed "s/\\([0-9]*\\)\\([.].*\\)/\\1/g"`
-  for i in $Is; do idone_condition="$idone_condition:$ARRAYID-$i"; done
-
-  sleep $SAFTEY_SLEEP
-  # once they're done, compile a list of all unfolding forces for each speed
-  JOBID=`echo "cd \\\$PBS_O_WORKDIR && cat run_$V-* | grep -v '#' | cut -f1 > data_$V" |\
-      qsub -h -W depend="$idone_condition" -N sawComp -o data_$V.o -e data_$V.e` \
-        || exit 1
-  vdone_condition="$vdone_condition:$JOBID"
-  VJOBIDS="$VJOBIDS $JOBID"
-  for i in $Is; do qrls "$ARRAYID-$i"; done
-done
-
-# once we have force lists for each velocity, make histograms and F(V) chart
-JOBID=`echo "cd \\\$PBS_O_WORKDIR && $HOME/script/vel_dep_graph $Vs" | \
-    qsub -W depend="$vdone_condition" -N sawsimGraph` || exit 1
-#  "3643.abax.physics.drexel.edu" --> "3643"
-ARRAYID=`echo "$JOBID" | sed "s/\\([0-9]*\\)\\([.].*\\)/\\1/g"`
-for job in $VJOBIDS; do qrls $job; done
-
-popd > /dev/null
-
-# wait until the final job exits
-qwait $ARRAYID
-
-exit 0
diff --git a/pysawsim/sawsim_histogram.py b/pysawsim/sawsim_histogram.py
new file mode 100644
index 0000000..848ca90
--- /dev/null
+++ b/pysawsim/sawsim_histogram.py
@@ -0,0 +1,134 @@
+# Copyright (C) 2009-2010  W. Trevor King <wking@drexel.edu>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# The author may be contacted at <wking@drexel.edu> on the Internet, or
+# write to Trevor King, Drudge's University, Physics Dept., 3141 Chestnut St.,
+# Philadelphia PA 19104, USA.
+
+from __future__ import with_statement
+
+import hashlib
+import os.path
+import shutil
+
+import numpy
+
+from . import __version__, log
+from .histogram import Histogram
+from .manager import InvokeJob
+from .sawsim import parse
+
+
+SAWSIM = 'sawsim'  # os.path.expand(os.path.join('~', 'bin', 'sawsim'))
+CACHE_DIR = os.path.expanduser(os.path.join('~', '.sawsim_histogram'))
+DEFAULT_PARAM_STRING = (
+    '-s cantilever,hooke,0.05 -N1 '
+    '-s folded,null -N8 '
+    "-s 'unfolded,wlc,{0.39e-9,28e-9}' "
+    "-k 'folded,unfolded,bell,{3.3e-4,0.25e-9}' "
+    '-q folded -v 1e-6')
+
+
+class SawsimHistogram (object):
+    def __init__(self, use_cache=False, clean_cache=False):
+        self._use_cache = use_cache
+        self._clean_cache = clean_cache
+
+    def _cache_dir(self, param_string):
+        """
+        >>> s = SawsimHistogram()
+        >>> s._cache_dir(DEFAULT_PARAM_STRING)  # doctest: +ELLIPSIS
+        '/.../.sawsim_histogram/...'
+        """
+        return os.path.join(
+            CACHE_DIR, hashlib.sha256(param_string).hexdigest())
+
+    def _make_cache(self, param_string):
+        cache_dir = self._cache_dir(params_string)
+        os.makedirs(cache_dir)
+        with open(os.path.join(d, 'param_string'), 'w') as f:
+            f.write('# version: %s\n%s\n' % (__version__, param_string))
+
+    def _load_cached_data(self, cache_dir, N):
+        data = {}
+        for i in range(N):
+            simulation_path = os.path.join(d, '%d.dat' % i)
+            if os.path.exists(simulation_path):
+                with open(simulation_path, 'r') as f:
+                    data[i] = parse(f.read())
+            else:
+                break
+        return data
+
+    def _cache_run(self, cache_dir, i, stdout):
+        simulation_path = os.path.join(d, '%d.dat' % i)
+        with open(simulation_path, 'w') as f:
+            f.write(stdout)
+
+    def __call__(self, param_string=None, N=400, bin_edges=None, manager=None):
+        """Run `N` simulations and return a histogram with `bin_edges`.
+
+        If `bin_edges == None`, return a numpy array of all unfolding
+        forces.
+
+        Use the `JobManager` instance `manager` for asynchronous job
+        execution.
+
+        If `_use_cache` is `True`, store an array of unfolding forces
+        in `CACHE_DIR` for each simulated pull.  If the cached forces
+        are already present for `param_string`, do not redo the
+        simulation unless `_clean_cache` is `True`.
+        """
+        data = {}
+        if self._use_cache == True:
+            d = self._cache_dir(param_string)
+            if os.path.exists(d):
+                if self._clean_cache == True:
+                    shutil.rmtree(d)
+                    self._make_cache(param_string)
+                else:
+                    data = self._load_cached_data(d, N)
+                    log().debug('using %d cached runs for %s'
+                                % (len(data), param_string))
+            else:
+                self._make_cache(param_string)
+
+        jobs = {}
+        for i in range(N):
+            if i in data:
+                continue
+            jobs[i] = manager.async_invoke(InvokeJob(
+                    '%s %s' % (SAWSIM, param_string)))
+        complete_jobs = manager.wait([job.id for job in jobs])
+        for i,job in jobs.iteritems:
+            j = complete_jobs[job.id]
+            assert j.status == 0, j.data
+            if self._use_cache == True:
+                self._cache_run(d, i, j.data['stdout'])
+            data[i] = parse(j.data['stdout'])
+        del(jobs)
+        del(complete_jobs)
+
+        # generate histogram
+        events = []
+        for d_i in data.values():
+            events.extend([e.force for e in d_i
+                           if e.initial_state == 'folded'])
+        events = numpy.array(d_i)
+        if bin_edges == None:
+            return events
+        h = Histogram()
+        h.from_data(events, bin_edges)
+        return h
diff --git a/pysawsim/vel_dep_graph b/pysawsim/vel_dep_graph
old mode 100755
new mode 100644
diff --git a/pysawsim/fit_force_histograms.py b/pysawsim/velocity_dependant_scan.py
old mode 100755
new mode 100644
similarity index 64%
rename from pysawsim/fit_force_histograms.py
rename to pysawsim/velocity_dependant_scan.py
index 87cc328..f877b44
--- a/pysawsim/fit_force_histograms.py
+++ b/pysawsim/velocity_dependant_scan.py
@@ -1,4 +1,3 @@
-#!/usr/bin/python
 # Copyright (C) 2009-2010  W. Trevor King <wking@drexel.edu>
 #
 # This program is free software: you can redistribute it and/or modify
@@ -24,12 +23,15 @@ import pickle
 import shutil
 import sys
 
-
 import matplotlib
 import numpy
 import pylab
 from scipy.optimize import leastsq
 
+from . import log
+from .histogram import Histogram
+from .sawsim_histogram import SawsimHistogram
+
 
 matplotlib.use('Agg')  # select backend that doesn't require X Windows
 FIGURE = pylab.figure()  # avoid memory problems.
@@ -44,7 +46,7 @@ def rss():
     For debugging memory usage.
 
     resident set size, the non-swapped physical memory that a task has
-    used (in kiloBytes).
+    used (in kilo-bytes).
     """
     call = "ps -o rss= -p %d" % os.getpid()
     status,stdout,stderr = invoke(call)
@@ -52,16 +54,24 @@ def rss():
 
 
 class HistogramMatcher (object):
-    def __init__(self, velocity_file, param_format_string, dir_prefix="v_dep"):
-        self.experiment_histograms = self.read_force_histograms(velocity_file)
+    """Compare experimental velocity dependent histograms to simulated data.
+
+    The main entry points are `fit()` and `plot()`.
+    """
+    def __init__(self, velocity_stream, param_format_string, N=400,
+                 residual_type='jensen-shannon', plot=True, use_cache=False):
+        self.experiment_histograms = self.read_force_histograms(velocity_stream)
         self.param_format_string = param_format_string
-        self.dir_prefix = dir_prefix
+        self.residual_type = residual_type
+        self.N = N
+        self.plot = plot
+        self.sawsim_histogram = SawsimHistogram(use_cache=self.use_cache)
 
-    def read_force_histograms(self, v_file):
+    def read_force_histograms(self, stream):
         """
         v_file format:
         # comment and blank lines ignored
-        <velocity in m/s><TAB><path to histogram file>
+        <velocity in m/s><whitespace><path to histogram file>
         ...
 
         e.g.
@@ -71,36 +81,26 @@ class HistogramMatcher (object):
         """
         histograms = {}
         v_file_dir = os.path.dirname(v_file)
-        for line in file(v_file, "r").readlines():
+        for line in strem.readlines():
             line = line.strip()
             if len(line) == 0 or line[0] == "#":
                 continue # ignore blank lines and comments
-            v,h_file = line.split("\t")
+            v,h_file = line.split()
             h = Histogram()
-            h.from_file(os.path.join(v_file_dir, h_file))
+            h.from_stream(file(os.path.join(v_file_dir, h_file), 'r'))
             histograms[v] = h
-            histograms[v].normalize()
         return histograms
 
-    def dirname(self, params):
-        if not hasattr(self, "_tag_i"):
-            self._tag_i = 0
-        self._tag_i += 1
-        return "%s-%d" % (self.dir_prefix, self._tag_i)
-
-    def generate_sawsim_histograms(self, params, histograms,
-                                   run_repeat_simulation=True):
-        velocities = histograms.keys()
-        dirname = self.dirname(params)
-        cmd = "run_vel_dep %s %s %s" \
-              % (dirname,
-                 ",".join([str(v) for v in velocities]),
-                 self.param_string(params))
-        if os.path.exists(dirname) and run_repeat_simulation == True:
-            print 'repeating simulation'
-            shutil.rmtree(dirname)
-        if not os.path.exists(dirname):
-            invoke(cmd)
+    def param_string(self, params, velocity):
+        """Generate a string of options to pass to `sawsim`.
+        """
+        return '%s -v %g' % (
+            self.param_format_string % tuple(params), velocity)
+
+    def get_all_unfolding_data(self, dirname, velocity_string):
+        datafile = os.path.join(dirname, "data_" + velocity_string)
+        return numpy.fromfile(datafile, sep=" ")
+
         sawsim_histograms = {}
         for velocity in velocities:
             unfolding_forces = self.get_all_unfolding_data(dirname, str(velocity))
@@ -111,58 +111,35 @@ class HistogramMatcher (object):
             sawsim_histograms[velocity].normalize()
         return sawsim_histograms
 
-    def param_string(self, params):
-        """
-        Generate a string of non-velocity options to pass to sawsim
-        for a given parameter list.
-        """
-        return self.param_format_string % (params[0], params[1])#tuple(params)
-
-    def get_all_unfolding_data(self, dirname, velocity_string):
-        datafile = os.path.join(dirname, "data_" + velocity_string)
-        return numpy.fromfile(datafile, sep=" ")
-
-    def residual(self, params, type='bin-height', plot=False,
-                 run_repeat_simulation=True):
-        sawsim_histograms = self.generate_sawsim_histograms( \
-            params, self.experiment_histograms,
-            run_repeat_simulation=run_repeat_simulation)
-        assert sorted(sawsim_histograms.keys()) == sorted(self.experiment_histograms.keys())
+    def _residual(self, params):
         residual = 0
-        for velocity,experiment_histogram in self.experiment_histograms.items():
-            sawsim_histogram = sawsim_histograms[velocity]
-            title = None
-            if plot == True:
-                title = ", ".join(["%g" % p for p in params])
-            r = experiment_histogram.residual(sawsim_histogram, type=type)
+        for velocity,experiment_hist in self.experiment_histograms.iteritems():
+            sawsim_hist = self.sawsim_histogram(
+                param_string=self.param_string(params, velocity), N=self.N,
+                bin_edges=experiment_hist.bin_edges)
+            r = experiment_histogram.residual(
+                sawsim_hist, type=self.residual_type)
             residual += r
-            if plot == True:
+            if self.plot == True:
+                title = ", ".join(["%g" % p for p in (params+[velocity])])
                 filename = "residual-%s-%g.png" % (title.replace(', ','-'), r)
-                self.plot_residual_comparison(
-                    experiment_histogram, sawsim_histogram, residual=r,
+                self._plot_residual_comparison(
+                    experiment_hist, sawsim_hist, residual=r,
                     title=title, filename=filename)
-        print "residual", residual
-        del(sawsim_histograms)
+        log().debug('residual: %g' % residual)
         return residual
 
-    def fit(self, initial_params, verbose=False):
-        p,cov,info,mesg,ier = leastsq(self.residual, initial_params,
+    def fit(self, initial_params):
+        p,cov,info,mesg,ier = leastsq(self._residual, initial_params,
                                       full_output=True, maxfev=1000)
-        if verbose == True:
-            print "Fitted params:",p
-            print "Covariance mx:",cov
-            print "Info:", info
-            print "mesg:", mesg
-            if ier == 1:
-                print "Solution converged"
-            else:
-                print "Solution did not converge"
+        _log = log()
+        _log.info('Fitted params: %s' % p)
+        _log.info('Covariance mx: %s' % cov)
+        _log.info('Info: %s' % info)
+        _log.info('Mesg: %s' % mesg)
         return p
 
-    def plot(self, param_ranges, logx=False, logy=False,
-             residual_type='bin-height',
-             plot_residuals=False, contour=False,
-             run_repeat_simulations=True):
+    def plot(self, param_ranges, logx=False, logy=False, contour=False):
         xranges = param_ranges[0]
         yranges = param_ranges[1]
         if logx == False:
@@ -181,12 +158,10 @@ class HistogramMatcher (object):
             for j,yi in enumerate(y[:-1]):
                 print i, j, i*(len(y)-1) + j, (len(x)-1)*(len(y)-1)
                 params = (xi,yi)
-                r = self.residual(params, type=residual_type,
-                                  plot=plot_residuals,
-                                  run_repeat_simulation=run_repeat_simulations)
+                r = self._residual(params)
                 C[j,i] = numpy.log(r) # better resolution in valleys
                 if MEM_DEBUG == True:
-                    print >> sys.stderr, "RSS: %d KB" % rss()
+                    log().debug('RSS: %d KB' % rss())
         C = numpy.nan_to_num(C) # NaN -> 0
         fid = file("fit_force_histograms-XYC.pkl", "wb")
         pickle.dump([X,Y,C], fid)
@@ -210,8 +185,8 @@ class HistogramMatcher (object):
         FIGURE.colorbar(p)
         FIGURE.savefig("figure.png")
 
-    def plot_residual_comparison(self, expeiment_hist, theory_hist,
-                                 residual, title, filename):
+    def _plot_residual_comparison(self, expeiment_hist, theory_hist,
+                                  residual, title, filename):
         FIGURE.clear()
         p = pylab.plot(experiment_hist.bin_edges[:-1],
                        experiment_hist.probabilities, 'r-',
@@ -222,10 +197,14 @@ class HistogramMatcher (object):
 
 
 def parse_param_ranges_string(string):
-    """
+    """Parse parameter range stings.
+
     '[Amin,Amax,Asteps],[Bmin,Bmax,Bsteps],...'
       ->
     [[Amin,Amax,Asteps],[Bmin,Bmax,Bsteps],...]
+
+    >>> parse_param_ranges_string('[1,2,3],[4,5,6]')
+    [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
     """
     ranges = []
     for range_string in string.split("],["):
@@ -233,6 +212,7 @@ def parse_param_ranges_string(string):
         ranges.append([float(x) for x in range_number_strings])
     return ranges
 
+
 def main():
     import optparse
     usage = "%prog [options] velocity_file"
@@ -250,16 +230,20 @@ def main():
                       metavar="PARAMS",
                       help="Param range for plotting (%default).",
                       default='[1e-5,1e-3,20],[0.1e-9,1e-9,20]')
+    parser.add_option("--N", "--number-of-runs", dest="N",
+                      metavar="INT", type='int',
+                      help="Number of sawsim runs at each point in parameter space (%default).",
+                      default=400)
     parser.add_option("--logx", dest="logx",
                       help="Use a log scale for the x range.",
                       default=False, action="store_true")
     parser.add_option("--logy", dest="logy",
                       help="Use a log scale for the y range.",
                       default=False, action="store_true")
-    parser.add_option("-d","--dir-prefix", dest="dir_prefix",
+    parser.add_option("-d","--cache-dir", dest="cache_dir",
                       metavar="STRING",
-                      help="Prefix to sawsim data directories (%default).",
-                      default='v_dep')
+                      help="Cache directory for sawsim unfolding forces (%default).",
+                      default=sawsim_histogram.CACHE_DIR)
     parser.add_option("-R","--residual", dest="residual",
                       metavar="STRING",
                       help="Residual type (from 'jensen-shannon', 'chi-squared', 'mean', 'std-dev'; default: %default).",
@@ -270,21 +254,24 @@ def main():
     parser.add_option("-c","--contour-plot", dest="contour_plot",
                       help="Select contour plot (vs. the default pseudocolor plot).",
                       default=False, action="store_true")
-    parser.add_option("-n","--no-repeat-simulations", dest="simulations",
-                      help="Do not run simulations if they already exist (re-analyze a previous run's output).",
-                      default=True, action="store_false")
+    parser.add_option("-C","--use_cache", dest="use_cache",
+                      help="Use cached simulations if they exist (vs. running new simulations) (%default)",
+                      default=False, action="store_true")
     options,args = parser.parse_args()
     initial_params = [float(p) for p in options.initial_params.split(",")]
     param_ranges = parse_param_ranges_string(options.param_range)
     velocity_file = args[0]
 
-    hm = HistogramMatcher(velocity_file, options.param_format, options.dir_prefix)
-    #hm.fit(initial_params, verbose=True)
+    sawsim_histogram.CACHE_DIR = options.cache_dir
+
+    hm = HistogramMatcher(
+        file(velocity_file, 'r'), param_format_string=options.param_format,
+        N=options.N, residual_type=options.residual,
+        plot=options.plot_residuals, use_cache=options.use_cache)
+    #hm.fit(initial_params)
     hm.plot(param_ranges, logx=options.logx, logy=options.logy,
-            residual_type=options.residual,
-            plot_residuals=options.plot_residuals,
-            contour=options.contour_plot,
-            run_repeat_simulations=options.simulations)
+            contour=options.contour_plot)
+
 
 if __name__ == "__main__":
     main()
-- 
2.26.2