pysawsim/fit_force_histograms.py

   1 #!/usr/bin/python
   2
   3 import os # HACK, for getpid()
   4 import os.path
   5 import pickle
   6 import numpy
   7 from scipy.optimize import leastsq
   8 import shutil
   9 from subprocess import Popen, PIPE
  10 import sys
  11
  12 import matplotlib
  13 matplotlib.use('Agg') # select backend that doesn't require X Windows
  14 import pylab
  15
  16 FIGURE = pylab.figure() # avoid memory problems.
  17 # pylab keeps internal references to all created figures, so share a
  18 # single instance.
  19 MEM_DEBUG = False
  20
  21 class CommandError(Exception):
  22     def __init__(self, command, status, stdout, stderr):
  23         strerror = ["Command failed (%d):\n  %s\n" % (status, stderr),
  24                     "while executing\n  %s" % command]
  25         Exception.__init__(self, "\n".join(strerror))
  26         self.command = command
  27         self.status = status
  28         self.stdout = stdout
  29         self.stderr = stderr
  30
  31 def invoke(cmd_string, stdin=None, expect=(0,), cwd=None, verbose=True):
  32     """
  33     expect should be a tuple of allowed exit codes.  cwd should be
  34     the directory from which the command will be executed.
  35     """
  36     if cwd == None:
  37         cwd = "."
  38     if verbose == True:
  39         print >> sys.stderr, "%s$ %s" % (cwd, cmd_string)
  40     try :
  41         if sys.platform != "win32" and False:
  42             q = Popen(cmd_string, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=cwd)
  43         else:
  44             # win32 don't have os.execvp() so have to run command in a shell
  45             q = Popen(cmd_string, stdin=PIPE, stdout=PIPE, stderr=PIPE,
  46                       shell=True, cwd=cwd)
  47     except OSError, e :
  48         raise CommandError(args, status=e.args[0], stdout="", stderr=e)
  49     output,error = q.communicate(input=stdin)
  50     status = q.wait()
  51     if verbose == True:
  52         print >> sys.stderr, "%d\n%s%s" % (status, output, error)
  53     if status not in expect:
  54         raise CommandError(args, status, output, error)
  55     return status, output, error
  56
  57 def rss():
  58     """
  59     For debugging memory usage.
  60
  61     resident set size, the non-swapped physical memory that a task has
  62     used (in kiloBytes).
  63     """
  64     call = "ps -o rss= -p %d" % os.getpid()
  65     status,stdout,stderr = invoke(call)
  66     return int(stdout)
  67
  68 class Histogram (object):
  69     def from_data(self, data, bin_edges):
  70         """
  71         All bins should be of equal width (so we can calculate which
  72         bin a data point belongs to).
  73
  74         data should be a numpy array.
  75         """
  76         self.bin_edges = bin_edges
  77         bin_width = self.bin_edges[1] - self.bin_edges[0]
  78
  79         bin_is = numpy.floor((data - self.bin_edges[0])/bin_width)
  80         self.counts = []
  81         for i in range(len(self.bin_edges)-1):
  82             self.counts.append(sum(bin_is == i))
  83         self.total = float(len(data)) # because some data might be outside the bins
  84         self.mean = data.mean()
  85         self.std_dev = data.std()
  86     def from_file(self, h_file):
  87         """
  88         h_file format:
  89         # comment and blank lines ignored
  90         <unfolding force in N><TAB><count>
  91         ...
  92
  93         e.g.
  94
  95         150e-12 10
  96         200e-12 40
  97         250e-12 5
  98
  99         The unfolding force should mark the left-hand side of the bin, and
 100         all bins should be of equal width (so we know where the last one ends).
 101         """
 102         self.bin_edges = []
 103         self.counts = []
 104         for line in file(h_file, "r").readlines():
 105             line = line.strip()
 106             if len(line) == 0 or line[0] == "#":
 107                 continue # ignore blank lines and comments
 108             force,count = line.split() #"\t")
 109             self.bin_edges.append(float(force))
 110             self.counts.append(float(count))
 111         bin_width = self.bin_edges[1] - self.bin_edges[0]
 112         self.bin_edges.append(self.counts[-1]+bin_width)
 113         self.total = float(sum(self.counts))
 114         self.mean = 0
 115         for bin,count in zip(self.bin_edges, self.counts):
 116             bin += bin_width / 2.0
 117             self.mean += bin * count
 118         self.mean /=  self.total
 119         variance = 0
 120         for bin,count in zip(self.bin_edges, self.counts):
 121             bin += bin_width / 2.0
 122             variance += (bin - self.mean)**2 * count
 123         self.std_dev = numpy.sqrt(variance)
 124     def normalize(self):
 125         self.probabilities = []
 126         self.total = float(self.total)
 127         for count in self.counts:
 128             self.probabilities.append(count/self.total)
 129     def residual(self, other, type='bin-height', plot=False, title=None):
 130         """
 131         Each bin is weighted by the probability of unfolding in that
 132         bin for the self histogram.  For residual types where there is
 133         a convention, this histogram is treated as the experimental
 134         histogram and the other histogram is treated as the
 135         theoretical one.
 136         """
 137         if type in ['jensen-shannon', 'chi-squared']:
 138             assert self.bin_edges == other.bin_edges
 139         if type == 'jensen-shannon':
 140             def d_KL_term(p,q):
 141                 """
 142                 Kullback-Leibler divergence for a single bin, with the
 143                 exception that we return 0 if q==0, rather than
 144                 exploding like d_KL should.  We can do this because
 145                 the way d_KL_term is used in the Jensen-Shannon
 146                 divergence, if q (really m) == 0, then p also == 0,
 147                 and the Jensen-Shannon divergence for the entire term
 148                 should be zero.
 149                 """
 150                 if p==0 or q==0 or p==q:
 151                     return 0.0
 152                 return p * numpy.log2(p/q)
 153             residual = 0
 154             for probA,probB in zip(self.probabilities, other.probabilities):
 155                 m = (probA+probB) / 2.0
 156                 residual += 0.5*(d_KL_term(probA,m) + d_KL_term(probB,m))
 157         elif type == 'chi-squared':
 158             residual = 0
 159             for probA,probB in zip(self.probabilities, other.probabilities):
 160                 residual += (probA-probB)**2 / probB
 161         elif type == 'mean':
 162             residual = (other.mean - self.mean)**2
 163         elif type == 'std-dev':
 164             residual = (other.std_dev - self.std_dev)**2
 165         else:
 166             raise NotImplementedError, type
 167         if plot == True:
 168             self.plot_residual_comparison(other, residual, title)
 169         return residual #*self.total # also weight by counts in our histogram
 170     def plot_residual_comparison(self, other, residual=None, title=None):
 171         if not hasattr(self, "_tag_i"):
 172             self._tag_i = 0
 173         self._tag_i += 1
 174
 175         if residual == None:
 176             residual = self.residual(other)
 177         FIGURE.clear()
 178         p = pylab.plot(self.bin_edges[:-1], self.probabilities, 'r-',
 179                        other.bin_edges[:-1], other.probabilities, 'b-')
 180         if title != None:
 181             pylab.title(title)
 182         FIGURE.savefig("residual-%g-%d.png" % (residual, self._tag_i))
 183
 184
 185 class HistogramMatcher (object):
 186     def __init__(self, velocity_file, param_format_string, dir_prefix="v_dep"):
 187         self.experiment_histograms = self.read_force_histograms(velocity_file)
 188         self.param_format_string = param_format_string
 189         self.dir_prefix = dir_prefix
 190     def read_force_histograms(self, v_file):
 191         """
 192         v_file format:
 193         # comment and blank lines ignored
 194         <velocity in m/s><TAB><path to histogram file>
 195         ...
 196
 197         e.g.
 198
 199         5e-7    histA
 200         1e-6    histB
 201         """
 202         histograms = {}
 203         v_file_dir = os.path.dirname(v_file)
 204         for line in file(v_file, "r").readlines():
 205             line = line.strip()
 206             if len(line) == 0 or line[0] == "#":
 207                 continue # ignore blank lines and comments
 208             v,h_file = line.split("\t")
 209             h = Histogram()
 210             h.from_file(os.path.join(v_file_dir, h_file))
 211             histograms[v] = h
 212             histograms[v].normalize()
 213         return histograms
 214     def dirname(self, params):
 215         if not hasattr(self, "_tag_i"):
 216             self._tag_i = 0
 217         self._tag_i += 1
 218         return "%s-%d" % (self.dir_prefix, self._tag_i)
 219     def generate_sawsim_histograms(self, params, histograms,
 220                                    run_repeat_simulation=True):
 221         velocities = histograms.keys()
 222         dirname = self.dirname(params)
 223         cmd = "run_vel_dep %s %s %s" \
 224               % (dirname,
 225                  ",".join([str(v) for v in velocities]),
 226                  self.param_string(params))
 227         if os.path.exists(dirname) and run_repeat_simulation == True:
 228             print 'repeating simulation'
 229             shutil.rmtree(dirname)
 230         if not os.path.exists(dirname):
 231             invoke(cmd)
 232         sawsim_histograms = {}
 233         for velocity in velocities:
 234             unfolding_forces = self.get_all_unfolding_data(dirname, str(velocity))
 235             bin_edges = histograms[velocity].bin_edges
 236             h = Histogram()
 237             h.from_data(unfolding_forces, bin_edges)
 238             sawsim_histograms[velocity] = h
 239             sawsim_histograms[velocity].normalize()
 240         return sawsim_histograms
 241     def param_string(self, params):
 242         """
 243         Generate a string of non-velocity options to pass to sawsim
 244         for a given parameter list.
 245         """
 246         return self.param_format_string % (params[0], params[1])#tuple(params)
 247     def get_all_unfolding_data(self, dirname, velocity_string):
 248         datafile = os.path.join(dirname, "data_" + velocity_string)
 249         return numpy.fromfile(datafile, sep=" ")
 250     def residual(self, params, type='bin-height', plot=False,
 251                  run_repeat_simulation=True):
 252         sawsim_histograms = self.generate_sawsim_histograms( \
 253             params, self.experiment_histograms,
 254             run_repeat_simulation=run_repeat_simulation)
 255         assert sorted(sawsim_histograms.keys()) == sorted(self.experiment_histograms.keys())
 256         residual = 0
 257         for velocity,experiment_histogram in self.experiment_histograms.items():
 258             sawsim_histogram = sawsim_histograms[velocity]
 259             title = None
 260             if plot == True:
 261                 title = ", ".join(["%g" % p for p in params])
 262             residual += experiment_histogram.residual( \
 263                 sawsim_histogram, type=type, plot=plot, title=title)
 264         print "residual", residual
 265         del(sawsim_histograms)
 266         return residual
 267     def fit(self, initial_params, verbose=False):
 268         p,cov,info,mesg,ier = leastsq(self.residual, initial_params,
 269                                       full_output=True, maxfev=1000)
 270         if verbose == True:
 271             print "Fitted params:",p
 272             print "Covariance mx:",cov
 273             print "Info:", info
 274             print "mesg:", mesg
 275             if ier == 1:
 276                 print "Solution converged"
 277             else:
 278                 print "Solution did not converge"
 279         return p
 280     def plot(self, param_ranges, logx=False, logy=False,
 281              residual_type='bin-height',
 282              plot_residuals=False, contour=False,
 283              run_repeat_simulations=True):
 284         xranges = param_ranges[0]
 285         yranges = param_ranges[1]
 286         if logx == False:
 287             x = numpy.linspace(*xranges)
 288         else:
 289             m,M,n = xranges
 290             x = numpy.exp(numpy.linspace(numpy.log(m), numpy.log(M), n))
 291         if logy == False:
 292             y = numpy.linspace(*yranges)
 293         else:
 294             m,M,n = yranges
 295             y = numpy.exp(numpy.linspace(numpy.log(m), numpy.log(M), n))
 296         X, Y = pylab.meshgrid(x,y)
 297         C = numpy.zeros((len(y)-1, len(x)-1))
 298         for i,xi in enumerate(x[:-1]):
 299             for j,yi in enumerate(y[:-1]):
 300                 print i, j, i*(len(y)-1) + j, (len(x)-1)*(len(y)-1)
 301                 params = (xi,yi)
 302                 r = self.residual(params, type=residual_type,
 303                                   plot=plot_residuals,
 304                                   run_repeat_simulation=run_repeat_simulations)
 305                 C[j,i] = numpy.log(r) # better resolution in valleys
 306                 if MEM_DEBUG == True:
 307                     print >> sys.stderr, "RSS: %d KB" % rss()
 308         C = numpy.nan_to_num(C) # NaN -> 0
 309         fid = file("fit_force_histograms-XYC.pkl", "wb")
 310         pickle.dump([X,Y,C], fid)
 311         fid.close()
 312         # read in with
 313         # import pickle
 314         # [X,Y,C] = pickle.load(file("fit_force_histograms-XYC.pkl", "rb"))
 315         # ...
 316         FIGURE.clear()
 317         axes = FIGURE.add_subplot(111)
 318         if logx == True:
 319             axes.set_xscale('log')
 320         if logy == True:
 321             axes.set_yscale('log')
 322         if contour == True:
 323             p = axes.contour(X[:-1,:-1], Y[:-1,:-1], C)
 324             # [:-1,:-1] to strip dummy last row & column from X&Y.
 325         else: # pseudocolor plot
 326             p = axes.pcolor(X, Y, C)
 327             axes.autoscale_view(tight=True)
 328         FIGURE.colorbar(p)
 329         FIGURE.savefig("figure.png")
 330
 331 def parse_param_ranges_string(string):
 332     """
 333     '[Amin,Amax,Asteps],[Bmin,Bmax,Bsteps],...'
 334       ->
 335     [[Amin,Amax,Asteps],[Bmin,Bmax,Bsteps],...]
 336     """
 337     ranges = []
 338     for range_string in string.split("],["):
 339         range_number_strings = range_string.strip("[]").split(",")
 340         ranges.append([float(x) for x in range_number_strings])
 341     return ranges
 342
 343 if __name__ == "__main__":
 344     import optparse
 345     usage = "%prog [options] velocity_file"
 346
 347     parser = optparse.OptionParser(usage)
 348     parser.add_option("-f","--param-format", dest="param_format",
 349                       metavar="FORMAT",
 350                       help="Convert params to sawsim options (%default).",
 351                       default=('-s cantilever,hooke,0.05 -N1 -s folded,null -N8 -s "unfolded,wlc,{0.39e-9,28e-9}" -k "folded,unfolded,bell,{%g,%g}" -q folded'))
 352     parser.add_option("-p","--initial-params", dest="initial_params",
 353                       metavar="PARAMS",
 354                       help="Initial params for fitting (%default).",
 355                       default='3.3e-4,0.25e-9')
 356     parser.add_option("-r","--param-range", dest="param_range",
 357                       metavar="PARAMS",
 358                       help="Param range for plotting (%default).",
 359                       default='[1e-5,1e-3,20],[0.1e-9,1e-9,20]')
 360     parser.add_option("--logx", dest="logx",
 361                       help="Use a log scale for the x range.",
 362                       default=False, action="store_true")
 363     parser.add_option("--logy", dest="logy",
 364                       help="Use a log scale for the y range.",
 365                       default=False, action="store_true")
 366     parser.add_option("-d","--dir-prefix", dest="dir_prefix",
 367                       metavar="STRING",
 368                       help="Prefix to sawsim data directories (%default).",
 369                       default='v_dep')
 370     parser.add_option("-R","--residual", dest="residual",
 371                       metavar="STRING",
 372                       help="Residual type (from 'jensen-shannon', 'chi-squared', 'mean', 'std-dev'; default: %default).",
 373                       default='jensen-shannon')
 374     parser.add_option("-P","--plot-residuals", dest="plot_residuals",
 375                       help="Generate residual difference plots for each point in the plot range.",
 376                       default=False, action="store_true")
 377     parser.add_option("-c","--contour-plot", dest="contour_plot",
 378                       help="Select contour plot (vs. the default pseudocolor plot).",
 379                       default=False, action="store_true")
 380     parser.add_option("-n","--no-repeat-simulations", dest="simulations",
 381                       help="Do not run simulations if they already exist (re-analyze a previous run's output).",
 382                       default=True, action="store_false")
 383     options,args = parser.parse_args()
 384     initial_params = [float(p) for p in options.initial_params.split(",")]
 385     param_ranges = parse_param_ranges_string(options.param_range)
 386     velocity_file = args[0]
 387
 388     hm = HistogramMatcher(velocity_file, options.param_format, options.dir_prefix)
 389     #hm.fit(initial_params, verbose=True)
 390     hm.plot(param_ranges, logx=options.logx, logy=options.logy,
 391             residual_type=options.residual,
 392             plot_residuals=options.plot_residuals,
 393             contour=options.contour_plot,
 394             run_repeat_simulations=options.simulations)