From: W. Trevor King <wking@drexel.edu>
Date: Mon, 18 Oct 2010 20:54:07 +0000 (-0400)
Subject: Introduce pysawsim in the README and add fit_force_histograms.py & friends.
X-Git-Url: http://git.tremily.us/?p=sawsim.git;a=commitdiff_plain;h=2e8536997bad422521e936d1dd7441d3d25d47c7

Introduce pysawsim in the README and add fit_force_histograms.py & friends.
---

diff --git a/README b/README
index 7f86701..46340b8 100644
--- a/README
+++ b/README
@@ -1,8 +1,8 @@
 Compiling
 ---------
 
-Sawsim is written in noweb_ (`noweb`, or the transitional `nowebm`, in
-Debian-based distributions).  Extract Makefile and compile with:
+Sawsim is written in noweb_ (`noweb` in Debian-based distributions).
+Extract Makefile and compile with:
 
   $ notangle -Rmakefile src/sawsim.nw | sed 's/        /\t/' > Makefile
   $ make
@@ -27,6 +27,15 @@ in Debian-based distributions).
 .. _check: http://check.sourceforge.net/
 
 
+Python wrapper
+--------------
+
+The `sawsim` binary only runs a single pull, but you'll probably want
+to run many repetitions to generate enough data for significant
+statistical analysis.  To facilitate this, we provide the `pysawsim`
+module which provides a higher level interface to `sawsim`.
+
+
 License
 -------
 
diff --git a/pysawsim/fit_force_histograms.py b/pysawsim/fit_force_histograms.py
new file mode 100755
index 0000000..09e645c
--- /dev/null
+++ b/pysawsim/fit_force_histograms.py
@@ -0,0 +1,394 @@
+#!/usr/bin/python
+
+import os # HACK, for getpid()
+import os.path
+import pickle
+import numpy
+from scipy.optimize import leastsq
+import shutil
+from subprocess import Popen, PIPE
+import sys
+
+import matplotlib
+matplotlib.use('Agg') # select backend that doesn't require X Windows
+import pylab
+
+FIGURE = pylab.figure() # avoid memory problems.
+# pylab keeps internal references to all created figures, so share a
+# single instance.
+MEM_DEBUG = False
+
+class CommandError(Exception):
+    def __init__(self, command, status, stdout, stderr):
+        strerror = ["Command failed (%d):\n  %s\n" % (status, stderr),
+                    "while executing\n  %s" % command]
+        Exception.__init__(self, "\n".join(strerror))
+        self.command = command
+        self.status = status
+        self.stdout = stdout
+        self.stderr = stderr
+
+def invoke(cmd_string, stdin=None, expect=(0,), cwd=None, verbose=True):
+    """
+    expect should be a tuple of allowed exit codes.  cwd should be
+    the directory from which the command will be executed.
+    """
+    if cwd == None:
+        cwd = "."
+    if verbose == True:
+        print >> sys.stderr, "%s$ %s" % (cwd, cmd_string)
+    try :
+        if sys.platform != "win32" and False:
+            q = Popen(cmd_string, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=cwd)
+        else:
+            # win32 don't have os.execvp() so have to run command in a shell
+            q = Popen(cmd_string, stdin=PIPE, stdout=PIPE, stderr=PIPE,
+                      shell=True, cwd=cwd)
+    except OSError, e :
+        raise CommandError(args, status=e.args[0], stdout="", stderr=e)
+    output,error = q.communicate(input=stdin)
+    status = q.wait()
+    if verbose == True:
+        print >> sys.stderr, "%d\n%s%s" % (status, output, error)
+    if status not in expect:
+        raise CommandError(args, status, output, error)
+    return status, output, error
+
+def rss():
+    """
+    For debugging memory usage.
+
+    resident set size, the non-swapped physical memory that a task has
+    used (in kiloBytes).
+    """
+    call = "ps -o rss= -p %d" % os.getpid()
+    status,stdout,stderr = invoke(call)
+    return int(stdout)
+
+class Histogram (object):
+    def from_data(self, data, bin_edges):
+        """
+        All bins should be of equal width (so we can calculate which
+        bin a data point belongs to).
+
+        data should be a numpy array.
+        """
+        self.bin_edges = bin_edges
+        bin_width = self.bin_edges[1] - self.bin_edges[0]
+
+        bin_is = numpy.floor((data - self.bin_edges[0])/bin_width)
+        self.counts = []
+        for i in range(len(self.bin_edges)-1):
+            self.counts.append(sum(bin_is == i))
+        self.total = float(len(data)) # because some data might be outside the bins
+        self.mean = data.mean()
+        self.std_dev = data.std()
+    def from_file(self, h_file):
+        """
+        h_file format:
+        # comment and blank lines ignored
+        <unfolding force in N><TAB><count>
+        ...
+
+        e.g.
+
+        150e-12 10
+        200e-12 40
+        250e-12 5
+
+        The unfolding force should mark the left-hand side of the bin, and
+        all bins should be of equal width (so we know where the last one ends).
+        """
+        self.bin_edges = []
+        self.counts = []
+        for line in file(h_file, "r").readlines():
+            line = line.strip()
+            if len(line) == 0 or line[0] == "#":
+                continue # ignore blank lines and comments
+            force,count = line.split() #"\t")
+            self.bin_edges.append(float(force))
+            self.counts.append(float(count))
+        bin_width = self.bin_edges[1] - self.bin_edges[0]
+        self.bin_edges.append(self.counts[-1]+bin_width)
+        self.total = float(sum(self.counts))
+        self.mean = 0
+        for bin,count in zip(self.bin_edges, self.counts):
+            bin += bin_width / 2.0
+            self.mean += bin * count
+        self.mean /=  self.total
+        variance = 0
+        for bin,count in zip(self.bin_edges, self.counts):
+            bin += bin_width / 2.0
+            variance += (bin - self.mean)**2 * count
+        self.std_dev = numpy.sqrt(variance)
+    def normalize(self):
+        self.probabilities = []
+        self.total = float(self.total)
+        for count in self.counts:
+            self.probabilities.append(count/self.total)
+    def residual(self, other, type='bin-height', plot=False, title=None):
+        """
+        Each bin is weighted by the probability of unfolding in that
+        bin for the self histogram.  For residual types where there is
+        a convention, this histogram is treated as the experimental
+        histogram and the other histogram is treated as the
+        theoretical one.
+        """
+        if type in ['jensen-shannon', 'chi-squared']:
+            assert self.bin_edges == other.bin_edges
+        if type == 'jensen-shannon':
+            def d_KL_term(p,q):
+                """
+                Kullback-Leibler divergence for a single bin, with the
+                exception that we return 0 if q==0, rather than
+                exploding like d_KL should.  We can do this because
+                the way d_KL_term is used in the Jensen-Shannon
+                divergence, if q (really m) == 0, then p also == 0,
+                and the Jensen-Shannon divergence for the entire term
+                should be zero.
+                """
+                if p==0 or q==0 or p==q:
+                    return 0.0
+                return p * numpy.log2(p/q)
+            residual = 0
+            for probA,probB in zip(self.probabilities, other.probabilities):
+                m = (probA+probB) / 2.0
+                residual += 0.5*(d_KL_term(probA,m) + d_KL_term(probB,m))
+        elif type == 'chi-squared':
+            residual = 0
+            for probA,probB in zip(self.probabilities, other.probabilities):
+                residual += (probA-probB)**2 / probB
+        elif type == 'mean':
+            residual = (other.mean - self.mean)**2
+        elif type == 'std-dev':
+            residual = (other.std_dev - self.std_dev)**2
+        else:
+            raise NotImplementedError, type
+        if plot == True:
+            self.plot_residual_comparison(other, residual, title)
+        return residual #*self.total # also weight by counts in our histogram
+    def plot_residual_comparison(self, other, residual=None, title=None):
+        if not hasattr(self, "_tag_i"):
+            self._tag_i = 0
+        self._tag_i += 1
+
+        if residual == None:
+            residual = self.residual(other)
+        FIGURE.clear()
+        p = pylab.plot(self.bin_edges[:-1], self.probabilities, 'r-',
+                       other.bin_edges[:-1], other.probabilities, 'b-')
+        if title != None:
+            pylab.title(title)
+        FIGURE.savefig("residual-%g-%d.png" % (residual, self._tag_i))
+
+
+class HistogramMatcher (object):
+    def __init__(self, velocity_file, param_format_string, dir_prefix="v_dep"):
+        self.experiment_histograms = self.read_force_histograms(velocity_file)
+        self.param_format_string = param_format_string
+        self.dir_prefix = dir_prefix
+    def read_force_histograms(self, v_file):
+        """
+        v_file format:
+        # comment and blank lines ignored
+        <velocity in m/s><TAB><path to histogram file>
+        ...
+
+        e.g.
+
+        5e-7    histA
+        1e-6    histB
+        """
+        histograms = {}
+        v_file_dir = os.path.dirname(v_file)
+        for line in file(v_file, "r").readlines():
+            line = line.strip()
+            if len(line) == 0 or line[0] == "#":
+                continue # ignore blank lines and comments
+            v,h_file = line.split("\t")
+            h = Histogram()
+            h.from_file(os.path.join(v_file_dir, h_file))
+            histograms[v] = h
+            histograms[v].normalize()
+        return histograms
+    def dirname(self, params):
+        if not hasattr(self, "_tag_i"):
+            self._tag_i = 0
+        self._tag_i += 1
+        return "%s-%d" % (self.dir_prefix, self._tag_i)
+    def generate_sawsim_histograms(self, params, histograms,
+                                   run_repeat_simulation=True):
+        velocities = histograms.keys()
+        dirname = self.dirname(params)
+        cmd = "run_vel_dep %s %s %s" \
+              % (dirname,
+                 ",".join([str(v) for v in velocities]),
+                 self.param_string(params))
+        if os.path.exists(dirname) and run_repeat_simulation == True:
+            print 'repeating simulation'
+            shutil.rmtree(dirname)
+        if not os.path.exists(dirname):
+            invoke(cmd)
+        sawsim_histograms = {}
+        for velocity in velocities:
+            unfolding_forces = self.get_all_unfolding_data(dirname, str(velocity))
+            bin_edges = histograms[velocity].bin_edges
+            h = Histogram()
+            h.from_data(unfolding_forces, bin_edges)
+            sawsim_histograms[velocity] = h
+            sawsim_histograms[velocity].normalize()
+        return sawsim_histograms
+    def param_string(self, params):
+        """
+        Generate a string of non-velocity options to pass to sawsim
+        for a given parameter list.
+        """
+        return self.param_format_string % (params[0], params[1])#tuple(params)
+    def get_all_unfolding_data(self, dirname, velocity_string):
+        datafile = os.path.join(dirname, "data_" + velocity_string)
+        return numpy.fromfile(datafile, sep=" ")
+    def residual(self, params, type='bin-height', plot=False,
+                 run_repeat_simulation=True):
+        sawsim_histograms = self.generate_sawsim_histograms( \
+            params, self.experiment_histograms,
+            run_repeat_simulation=run_repeat_simulation)
+        assert sorted(sawsim_histograms.keys()) == sorted(self.experiment_histograms.keys())
+        residual = 0
+        for velocity,experiment_histogram in self.experiment_histograms.items():
+            sawsim_histogram = sawsim_histograms[velocity]
+            title = None
+            if plot == True:
+                title = ", ".join(["%g" % p for p in params])
+            residual += experiment_histogram.residual( \
+                sawsim_histogram, type=type, plot=plot, title=title)
+        print "residual", residual
+        del(sawsim_histograms)
+        return residual
+    def fit(self, initial_params, verbose=False):
+        p,cov,info,mesg,ier = leastsq(self.residual, initial_params,
+                                      full_output=True, maxfev=1000)
+        if verbose == True:
+            print "Fitted params:",p
+            print "Covariance mx:",cov
+            print "Info:", info
+            print "mesg:", mesg
+            if ier == 1:
+                print "Solution converged"
+            else:
+                print "Solution did not converge"
+        return p
+    def plot(self, param_ranges, logx=False, logy=False,
+             residual_type='bin-height',
+             plot_residuals=False, contour=False,
+             run_repeat_simulations=True):
+        xranges = param_ranges[0]
+        yranges = param_ranges[1]
+        if logx == False:
+            x = numpy.linspace(*xranges)
+        else:
+            m,M,n = xranges
+            x = numpy.exp(numpy.linspace(numpy.log(m), numpy.log(M), n))
+        if logy == False:
+            y = numpy.linspace(*yranges)
+        else:
+            m,M,n = yranges
+            y = numpy.exp(numpy.linspace(numpy.log(m), numpy.log(M), n))
+        X, Y = pylab.meshgrid(x,y)
+        C = numpy.zeros((len(y)-1, len(x)-1))
+        for i,xi in enumerate(x[:-1]):
+            for j,yi in enumerate(y[:-1]):
+                print i, j, i*(len(y)-1) + j, (len(x)-1)*(len(y)-1)
+                params = (xi,yi)
+                r = self.residual(params, type=residual_type,
+                                  plot=plot_residuals,
+                                  run_repeat_simulation=run_repeat_simulations)
+                C[j,i] = numpy.log(r) # better resolution in valleys
+                if MEM_DEBUG == True:
+                    print >> sys.stderr, "RSS: %d KB" % rss()
+        C = numpy.nan_to_num(C) # NaN -> 0
+        fid = file("fit_force_histograms-XYC.pkl", "wb")
+        pickle.dump([X,Y,C], fid)
+        fid.close()
+        # read in with
+        # import pickle
+        # [X,Y,C] = pickle.load(file("fit_force_histograms-XYC.pkl", "rb"))
+        # ...
+        FIGURE.clear()
+        axes = FIGURE.add_subplot(111)
+        if logx == True:
+            axes.set_xscale('log')
+        if logy == True:
+            axes.set_yscale('log')
+        if contour == True:
+            p = axes.contour(X[:-1,:-1], Y[:-1,:-1], C)
+            # [:-1,:-1] to strip dummy last row & column from X&Y.
+        else: # pseudocolor plot
+            p = axes.pcolor(X, Y, C)
+            axes.autoscale_view(tight=True)
+        FIGURE.colorbar(p)
+        FIGURE.savefig("figure.png")
+
+def parse_param_ranges_string(string):
+    """
+    '[Amin,Amax,Asteps],[Bmin,Bmax,Bsteps],...'
+      ->
+    [[Amin,Amax,Asteps],[Bmin,Bmax,Bsteps],...]
+    """
+    ranges = []
+    for range_string in string.split("],["):
+        range_number_strings = range_string.strip("[]").split(",")
+        ranges.append([float(x) for x in range_number_strings])
+    return ranges
+
+if __name__ == "__main__":
+    import optparse
+    usage = "%prog [options] velocity_file"
+
+    parser = optparse.OptionParser(usage)
+    parser.add_option("-f","--param-format", dest="param_format",
+                      metavar="FORMAT",
+                      help="Convert params to sawsim options (%default).",
+                      default=('-s cantilever,hooke,0.05 -N1 -s folded,null -N8 -s "unfolded,wlc,{0.39e-9,28e-9}" -k "folded,unfolded,bell,{%g,%g}" -q folded'))
+    parser.add_option("-p","--initial-params", dest="initial_params",
+                      metavar="PARAMS",
+                      help="Initial params for fitting (%default).",
+                      default='3.3e-4,0.25e-9')
+    parser.add_option("-r","--param-range", dest="param_range",
+                      metavar="PARAMS",
+                      help="Param range for plotting (%default).",
+                      default='[1e-5,1e-3,20],[0.1e-9,1e-9,20]')
+    parser.add_option("--logx", dest="logx",
+                      help="Use a log scale for the x range.",
+                      default=False, action="store_true")
+    parser.add_option("--logy", dest="logy",
+                      help="Use a log scale for the y range.",
+                      default=False, action="store_true")
+    parser.add_option("-d","--dir-prefix", dest="dir_prefix",
+                      metavar="STRING",
+                      help="Prefix to sawsim data directories (%default).",
+                      default='v_dep')
+    parser.add_option("-R","--residual", dest="residual",
+                      metavar="STRING",
+                      help="Residual type (from 'jensen-shannon', 'chi-squared', 'mean', 'std-dev'; default: %default).",
+                      default='jensen-shannon')
+    parser.add_option("-P","--plot-residuals", dest="plot_residuals",
+                      help="Generate residual difference plots for each point in the plot range.",
+                      default=False, action="store_true")
+    parser.add_option("-c","--contour-plot", dest="contour_plot",
+                      help="Select contour plot (vs. the default pseudocolor plot).",
+                      default=False, action="store_true")
+    parser.add_option("-n","--no-repeat-simulations", dest="simulations",
+                      help="Do not run simulations if they already exist (re-analyze a previous run's output).",
+                      default=True, action="store_false")
+    options,args = parser.parse_args()
+    initial_params = [float(p) for p in options.initial_params.split(",")]
+    param_ranges = parse_param_ranges_string(options.param_range)
+    velocity_file = args[0]
+
+    hm = HistogramMatcher(velocity_file, options.param_format, options.dir_prefix)
+    #hm.fit(initial_params, verbose=True)
+    hm.plot(param_ranges, logx=options.logx, logy=options.logy,
+            residual_type=options.residual,
+            plot_residuals=options.plot_residuals,
+            contour=options.contour_plot,
+            run_repeat_simulations=options.simulations)
diff --git a/pysawsim/qwait b/pysawsim/qwait
new file mode 100755
index 0000000..6a3427b
--- /dev/null
+++ b/pysawsim/qwait
@@ -0,0 +1,15 @@
+#!/bin/bash
+#
+# Wait for a job to finish execution
+
+JOBID="$1"
+
+while sleep 3; do
+  STAT=`qstat $JOBID 2>&1 1>/dev/null`
+  UNKOWN=`echo "$STAT" | grep "Unknown Job Id"`
+  if [ -n "$UNKOWN" ]; then
+    exit 0 # job has completed, since qstat doesn't recognize it.
+  fi
+done
+
+exit 1
diff --git a/pysawsim/run_vel_dep b/pysawsim/run_vel_dep
new file mode 100755
index 0000000..cb47846
--- /dev/null
+++ b/pysawsim/run_vel_dep
@@ -0,0 +1,97 @@
+#!/bin/bash
+#
+# run_vel_dep
+#
+# run BN*N simulations each at a series of pulling speeds V
+# (non-velocity parameters are copied from the run_vel_dep command
+# line) for each speed, save all unfolding data data_$V, and generate
+# a histogram of unfolding forces hist_$V also generate a file v_dep
+# with the mean unfolding force vs velocity.
+#
+# usage: run_vel_dep OUTPUT_DIR VS SAWSIM PARAMS...
+# e.g.   run_vel_dep v_dep-123xs \
+#          1e-9,1e-8,1e-7,1e-6,1e-5 \
+#          -s cantilever,hooke,0.05 -N1 \
+#          -s folded,null -N8 \
+#          -s 'unfolded,wlc,{0.39e-9,28e-9}' \
+#          -k 'folded,unfolded,bell,{3.3e-4,0.25e-9}' \
+#          -q folded
+
+if [ $# -lt 3 ]; then
+  STAMP=`date +"%Y.%m.%d.%H.%M.%S"`
+  echo 'usage: run_vel_dep OUTPUT_DIR VS SAWSIM PARAMS...'
+  echo "e.g.   run_vel_dep v_dep-$STAMP \\"
+  echo '   1e-9,1e-8,1e-7,1e-6,1e-5 \'
+  echo '   -s cantilever,hooke,0.05 -N1 \'
+  echo '   -s folded,null -N8 \'
+  echo '   -s "unfolded,wlc,{0.39e-9,28e-9}" \'
+  echo '   -k "folded,unfolded,bell,{3.3e-4,0.25e-9}" \'
+  echo '   -q folded'
+  exit 1
+fi
+
+DIR="$1"
+shift
+Vs=`echo $1 | sed 's/,/ /g'`
+shift
+# Protect special chars from later shell expansion by quoting and
+# escaping and " characters that may have been in $arg already.
+Q='"'
+EQ='\"'
+SAWSIM_PARAMS=`for arg in $@; do echo -n "$Q${arg//$Q/$EQ}$Q "; done`
+
+
+# We can't depend on the entire array in one shot, so
+# batch executions into a single array instance
+BN=20 # sawsim executions per array job instance
+#N=8 # number of arrays
+N=20 # number of arrays
+SAFTEY_SLEEP=2 # seconds to sleep between spawning array job and depending on it
+               # otherwise dependencies don't catch
+
+mkdir "$DIR" || exit 1
+pushd "$DIR" > /dev/null
+echo "Date "`date` > v_dep_notes
+echo "Run$ sawsim $SAWSIM_PARAMS -v \$V" >> v_dep_notes
+echo "for V in $Vs" >> v_dep_notes
+
+Is=`seq 1 $N` # a list of allowed 'i's, since 'for' more compact than 'while'
+
+vdone_condition="afterany"
+VJOBIDS=""
+for V in $Vs
+do
+  # run N sawtooth simulations
+  idone_condition="afterany"
+  cmd="cd \$PBS_O_WORKDIR
+       for i in \`seq 1 $BN\`; do
+         ~/sawsim/sawsim $SAWSIM_PARAMS -v $V > run_$V-\$PBS_ARRAYID-\$i
+       done"
+  JOBID=`echo "$cmd" | qsub -h -t 1-$N -N "sawsim" -o run_$V.o -e run_$V.e` || exit 1
+  #  "3643.abax.physics.drexel.edu" --> "3643"
+  ARRAYID=`echo "$JOBID" | sed "s/\\([0-9]*\\)\\([.].*\\)/\\1/g"`
+  for i in $Is; do idone_condition="$idone_condition:$ARRAYID-$i"; done
+
+  sleep $SAFTEY_SLEEP
+  # once they're done, compile a list of all unfolding forces for each speed
+  JOBID=`echo "cd \\\$PBS_O_WORKDIR && cat run_$V-* | grep -v '#' | cut -f1 > data_$V" |\
+      qsub -h -W depend="$idone_condition" -N sawComp -o data_$V.o -e data_$V.e` \
+        || exit 1
+  vdone_condition="$vdone_condition:$JOBID"
+  VJOBIDS="$VJOBIDS $JOBID"
+  for i in $Is; do qrls "$ARRAYID-$i"; done
+done
+
+# once we have force lists for each velocity, make histograms and F(V) chart
+JOBID=`echo "cd \\\$PBS_O_WORKDIR && $HOME/script/vel_dep_graph $Vs" | \
+    qsub -W depend="$vdone_condition" -N sawsimGraph` || exit 1
+#  "3643.abax.physics.drexel.edu" --> "3643"
+ARRAYID=`echo "$JOBID" | sed "s/\\([0-9]*\\)\\([.].*\\)/\\1/g"`
+for job in $VJOBIDS; do qrls $job; done
+
+popd > /dev/null
+
+# wait until the final job exits
+qwait $ARRAYID
+
+exit 0
diff --git a/pysawsim/vel_dep_graph b/pysawsim/vel_dep_graph
new file mode 100755
index 0000000..191c194
--- /dev/null
+++ b/pysawsim/vel_dep_graph
@@ -0,0 +1,23 @@
+#!/bin/bash
+#
+# Once we have force lists for each velocity, make histograms and F(V) chart.
+
+> v_dep
+for V in $*
+do
+  # make histograms
+  OUT=`python -c "import numpy; a=numpy.fromfile('data_$V', sep=' '); print a.mean(), a.std()"`
+  AVG=`echo "$OUT" | cut -d\  -f1`
+  STD=`echo "$OUT" | cut -d\  -f2`
+  #AVG=`awk 'BEGIN{s=0}{s += $1}END{print s/NR}' data_$V`
+  #STD=`awk "BEGIN{s=0}{s += (\$1-$AVG)**2}END{print ...  s/NR}" data_$V`
+  echo -e "$V\t$AVG\t$STD" >> v_dep
+  cat data_$V | stem_leaf -v-1 -b1e-12 | cut -d\  -f1,3 > hist_$V
+done
+
+echo "#set terminal png
+#set output 'v_dep.png'
+set logscale x
+set xlabel 'Pulling speed (um/s)'
+set ylabel 'Unfolding force (pN)'
+plot  'v_dep' using (\$1*1e6):(\$2*1e12) with points" > v_dep.gp