Introduce pysawsim in the README and add fit_force_histograms.py & friends.
[sawsim.git] / pysawsim / fit_force_histograms.py
1 #!/usr/bin/python
2
3 import os # HACK, for getpid()
4 import os.path
5 import pickle
6 import numpy
7 from scipy.optimize import leastsq
8 import shutil
9 from subprocess import Popen, PIPE
10 import sys
11
12 import matplotlib
13 matplotlib.use('Agg') # select backend that doesn't require X Windows
14 import pylab
15
16 FIGURE = pylab.figure() # avoid memory problems.
17 # pylab keeps internal references to all created figures, so share a
18 # single instance.
19 MEM_DEBUG = False
20
21 class CommandError(Exception):
22     def __init__(self, command, status, stdout, stderr):
23         strerror = ["Command failed (%d):\n  %s\n" % (status, stderr),
24                     "while executing\n  %s" % command]
25         Exception.__init__(self, "\n".join(strerror))
26         self.command = command
27         self.status = status
28         self.stdout = stdout
29         self.stderr = stderr
30
31 def invoke(cmd_string, stdin=None, expect=(0,), cwd=None, verbose=True):
32     """
33     expect should be a tuple of allowed exit codes.  cwd should be
34     the directory from which the command will be executed.
35     """
36     if cwd == None:
37         cwd = "."
38     if verbose == True:
39         print >> sys.stderr, "%s$ %s" % (cwd, cmd_string)
40     try :
41         if sys.platform != "win32" and False:
42             q = Popen(cmd_string, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=cwd)
43         else:
44             # win32 don't have os.execvp() so have to run command in a shell
45             q = Popen(cmd_string, stdin=PIPE, stdout=PIPE, stderr=PIPE,
46                       shell=True, cwd=cwd)
47     except OSError, e :
48         raise CommandError(args, status=e.args[0], stdout="", stderr=e)
49     output,error = q.communicate(input=stdin)
50     status = q.wait()
51     if verbose == True:
52         print >> sys.stderr, "%d\n%s%s" % (status, output, error)
53     if status not in expect:
54         raise CommandError(args, status, output, error)
55     return status, output, error
56
57 def rss():
58     """
59     For debugging memory usage.
60
61     resident set size, the non-swapped physical memory that a task has
62     used (in kiloBytes).
63     """
64     call = "ps -o rss= -p %d" % os.getpid()
65     status,stdout,stderr = invoke(call)
66     return int(stdout)
67
68 class Histogram (object):
69     def from_data(self, data, bin_edges):
70         """
71         All bins should be of equal width (so we can calculate which
72         bin a data point belongs to).
73
74         data should be a numpy array.
75         """
76         self.bin_edges = bin_edges
77         bin_width = self.bin_edges[1] - self.bin_edges[0]
78
79         bin_is = numpy.floor((data - self.bin_edges[0])/bin_width)
80         self.counts = []
81         for i in range(len(self.bin_edges)-1):
82             self.counts.append(sum(bin_is == i))
83         self.total = float(len(data)) # because some data might be outside the bins
84         self.mean = data.mean()
85         self.std_dev = data.std()
86     def from_file(self, h_file):
87         """
88         h_file format:
89         # comment and blank lines ignored
90         <unfolding force in N><TAB><count>
91         ...
92
93         e.g.
94
95         150e-12 10
96         200e-12 40
97         250e-12 5
98
99         The unfolding force should mark the left-hand side of the bin, and
100         all bins should be of equal width (so we know where the last one ends).
101         """
102         self.bin_edges = []
103         self.counts = []
104         for line in file(h_file, "r").readlines():
105             line = line.strip()
106             if len(line) == 0 or line[0] == "#":
107                 continue # ignore blank lines and comments
108             force,count = line.split() #"\t")
109             self.bin_edges.append(float(force))
110             self.counts.append(float(count))
111         bin_width = self.bin_edges[1] - self.bin_edges[0]
112         self.bin_edges.append(self.counts[-1]+bin_width)
113         self.total = float(sum(self.counts))
114         self.mean = 0
115         for bin,count in zip(self.bin_edges, self.counts):
116             bin += bin_width / 2.0
117             self.mean += bin * count
118         self.mean /=  self.total
119         variance = 0
120         for bin,count in zip(self.bin_edges, self.counts):
121             bin += bin_width / 2.0
122             variance += (bin - self.mean)**2 * count
123         self.std_dev = numpy.sqrt(variance)
124     def normalize(self):
125         self.probabilities = []
126         self.total = float(self.total)
127         for count in self.counts:
128             self.probabilities.append(count/self.total)
129     def residual(self, other, type='bin-height', plot=False, title=None):
130         """
131         Each bin is weighted by the probability of unfolding in that
132         bin for the self histogram.  For residual types where there is
133         a convention, this histogram is treated as the experimental
134         histogram and the other histogram is treated as the
135         theoretical one.
136         """
137         if type in ['jensen-shannon', 'chi-squared']:
138             assert self.bin_edges == other.bin_edges
139         if type == 'jensen-shannon':
140             def d_KL_term(p,q):
141                 """
142                 Kullback-Leibler divergence for a single bin, with the
143                 exception that we return 0 if q==0, rather than
144                 exploding like d_KL should.  We can do this because
145                 the way d_KL_term is used in the Jensen-Shannon
146                 divergence, if q (really m) == 0, then p also == 0,
147                 and the Jensen-Shannon divergence for the entire term
148                 should be zero.
149                 """
150                 if p==0 or q==0 or p==q:
151                     return 0.0
152                 return p * numpy.log2(p/q)
153             residual = 0
154             for probA,probB in zip(self.probabilities, other.probabilities):
155                 m = (probA+probB) / 2.0
156                 residual += 0.5*(d_KL_term(probA,m) + d_KL_term(probB,m))
157         elif type == 'chi-squared':
158             residual = 0
159             for probA,probB in zip(self.probabilities, other.probabilities):
160                 residual += (probA-probB)**2 / probB
161         elif type == 'mean':
162             residual = (other.mean - self.mean)**2
163         elif type == 'std-dev':
164             residual = (other.std_dev - self.std_dev)**2
165         else:
166             raise NotImplementedError, type
167         if plot == True:
168             self.plot_residual_comparison(other, residual, title)
169         return residual #*self.total # also weight by counts in our histogram
170     def plot_residual_comparison(self, other, residual=None, title=None):
171         if not hasattr(self, "_tag_i"):
172             self._tag_i = 0
173         self._tag_i += 1
174
175         if residual == None:
176             residual = self.residual(other)
177         FIGURE.clear()
178         p = pylab.plot(self.bin_edges[:-1], self.probabilities, 'r-',
179                        other.bin_edges[:-1], other.probabilities, 'b-')
180         if title != None:
181             pylab.title(title)
182         FIGURE.savefig("residual-%g-%d.png" % (residual, self._tag_i))
183
184
185 class HistogramMatcher (object):
186     def __init__(self, velocity_file, param_format_string, dir_prefix="v_dep"):
187         self.experiment_histograms = self.read_force_histograms(velocity_file)
188         self.param_format_string = param_format_string
189         self.dir_prefix = dir_prefix
190     def read_force_histograms(self, v_file):
191         """
192         v_file format:
193         # comment and blank lines ignored
194         <velocity in m/s><TAB><path to histogram file>
195         ...
196
197         e.g.
198
199         5e-7    histA
200         1e-6    histB
201         """
202         histograms = {}
203         v_file_dir = os.path.dirname(v_file)
204         for line in file(v_file, "r").readlines():
205             line = line.strip()
206             if len(line) == 0 or line[0] == "#":
207                 continue # ignore blank lines and comments
208             v,h_file = line.split("\t")
209             h = Histogram()
210             h.from_file(os.path.join(v_file_dir, h_file))
211             histograms[v] = h
212             histograms[v].normalize()
213         return histograms
214     def dirname(self, params):
215         if not hasattr(self, "_tag_i"):
216             self._tag_i = 0
217         self._tag_i += 1
218         return "%s-%d" % (self.dir_prefix, self._tag_i)
219     def generate_sawsim_histograms(self, params, histograms,
220                                    run_repeat_simulation=True):
221         velocities = histograms.keys()
222         dirname = self.dirname(params)
223         cmd = "run_vel_dep %s %s %s" \
224               % (dirname,
225                  ",".join([str(v) for v in velocities]),
226                  self.param_string(params))
227         if os.path.exists(dirname) and run_repeat_simulation == True:
228             print 'repeating simulation'
229             shutil.rmtree(dirname)
230         if not os.path.exists(dirname):
231             invoke(cmd)
232         sawsim_histograms = {}
233         for velocity in velocities:
234             unfolding_forces = self.get_all_unfolding_data(dirname, str(velocity))
235             bin_edges = histograms[velocity].bin_edges
236             h = Histogram()
237             h.from_data(unfolding_forces, bin_edges)
238             sawsim_histograms[velocity] = h
239             sawsim_histograms[velocity].normalize()
240         return sawsim_histograms
241     def param_string(self, params):
242         """
243         Generate a string of non-velocity options to pass to sawsim
244         for a given parameter list.
245         """
246         return self.param_format_string % (params[0], params[1])#tuple(params)
247     def get_all_unfolding_data(self, dirname, velocity_string):
248         datafile = os.path.join(dirname, "data_" + velocity_string)
249         return numpy.fromfile(datafile, sep=" ")
250     def residual(self, params, type='bin-height', plot=False,
251                  run_repeat_simulation=True):
252         sawsim_histograms = self.generate_sawsim_histograms( \
253             params, self.experiment_histograms,
254             run_repeat_simulation=run_repeat_simulation)
255         assert sorted(sawsim_histograms.keys()) == sorted(self.experiment_histograms.keys())
256         residual = 0
257         for velocity,experiment_histogram in self.experiment_histograms.items():
258             sawsim_histogram = sawsim_histograms[velocity]
259             title = None
260             if plot == True:
261                 title = ", ".join(["%g" % p for p in params])
262             residual += experiment_histogram.residual( \
263                 sawsim_histogram, type=type, plot=plot, title=title)
264         print "residual", residual
265         del(sawsim_histograms)
266         return residual
267     def fit(self, initial_params, verbose=False):
268         p,cov,info,mesg,ier = leastsq(self.residual, initial_params,
269                                       full_output=True, maxfev=1000)
270         if verbose == True:
271             print "Fitted params:",p
272             print "Covariance mx:",cov
273             print "Info:", info
274             print "mesg:", mesg
275             if ier == 1:
276                 print "Solution converged"
277             else:
278                 print "Solution did not converge"
279         return p
280     def plot(self, param_ranges, logx=False, logy=False,
281              residual_type='bin-height',
282              plot_residuals=False, contour=False,
283              run_repeat_simulations=True):
284         xranges = param_ranges[0]
285         yranges = param_ranges[1]
286         if logx == False:
287             x = numpy.linspace(*xranges)
288         else:
289             m,M,n = xranges
290             x = numpy.exp(numpy.linspace(numpy.log(m), numpy.log(M), n))
291         if logy == False:
292             y = numpy.linspace(*yranges)
293         else:
294             m,M,n = yranges
295             y = numpy.exp(numpy.linspace(numpy.log(m), numpy.log(M), n))
296         X, Y = pylab.meshgrid(x,y)
297         C = numpy.zeros((len(y)-1, len(x)-1))
298         for i,xi in enumerate(x[:-1]):
299             for j,yi in enumerate(y[:-1]):
300                 print i, j, i*(len(y)-1) + j, (len(x)-1)*(len(y)-1)
301                 params = (xi,yi)
302                 r = self.residual(params, type=residual_type,
303                                   plot=plot_residuals,
304                                   run_repeat_simulation=run_repeat_simulations)
305                 C[j,i] = numpy.log(r) # better resolution in valleys
306                 if MEM_DEBUG == True:
307                     print >> sys.stderr, "RSS: %d KB" % rss()
308         C = numpy.nan_to_num(C) # NaN -> 0
309         fid = file("fit_force_histograms-XYC.pkl", "wb")
310         pickle.dump([X,Y,C], fid)
311         fid.close()
312         # read in with
313         # import pickle
314         # [X,Y,C] = pickle.load(file("fit_force_histograms-XYC.pkl", "rb"))
315         # ...
316         FIGURE.clear()
317         axes = FIGURE.add_subplot(111)
318         if logx == True:
319             axes.set_xscale('log')
320         if logy == True:
321             axes.set_yscale('log')
322         if contour == True:
323             p = axes.contour(X[:-1,:-1], Y[:-1,:-1], C)
324             # [:-1,:-1] to strip dummy last row & column from X&Y.
325         else: # pseudocolor plot
326             p = axes.pcolor(X, Y, C)
327             axes.autoscale_view(tight=True)
328         FIGURE.colorbar(p)
329         FIGURE.savefig("figure.png")
330
331 def parse_param_ranges_string(string):
332     """
333     '[Amin,Amax,Asteps],[Bmin,Bmax,Bsteps],...'
334       ->
335     [[Amin,Amax,Asteps],[Bmin,Bmax,Bsteps],...]
336     """
337     ranges = []
338     for range_string in string.split("],["):
339         range_number_strings = range_string.strip("[]").split(",")
340         ranges.append([float(x) for x in range_number_strings])
341     return ranges
342
343 if __name__ == "__main__":
344     import optparse
345     usage = "%prog [options] velocity_file"
346
347     parser = optparse.OptionParser(usage)
348     parser.add_option("-f","--param-format", dest="param_format",
349                       metavar="FORMAT",
350                       help="Convert params to sawsim options (%default).",
351                       default=('-s cantilever,hooke,0.05 -N1 -s folded,null -N8 -s "unfolded,wlc,{0.39e-9,28e-9}" -k "folded,unfolded,bell,{%g,%g}" -q folded'))
352     parser.add_option("-p","--initial-params", dest="initial_params",
353                       metavar="PARAMS",
354                       help="Initial params for fitting (%default).",
355                       default='3.3e-4,0.25e-9')
356     parser.add_option("-r","--param-range", dest="param_range",
357                       metavar="PARAMS",
358                       help="Param range for plotting (%default).",
359                       default='[1e-5,1e-3,20],[0.1e-9,1e-9,20]')
360     parser.add_option("--logx", dest="logx",
361                       help="Use a log scale for the x range.",
362                       default=False, action="store_true")
363     parser.add_option("--logy", dest="logy",
364                       help="Use a log scale for the y range.",
365                       default=False, action="store_true")
366     parser.add_option("-d","--dir-prefix", dest="dir_prefix",
367                       metavar="STRING",
368                       help="Prefix to sawsim data directories (%default).",
369                       default='v_dep')
370     parser.add_option("-R","--residual", dest="residual",
371                       metavar="STRING",
372                       help="Residual type (from 'jensen-shannon', 'chi-squared', 'mean', 'std-dev'; default: %default).",
373                       default='jensen-shannon')
374     parser.add_option("-P","--plot-residuals", dest="plot_residuals",
375                       help="Generate residual difference plots for each point in the plot range.",
376                       default=False, action="store_true")
377     parser.add_option("-c","--contour-plot", dest="contour_plot",
378                       help="Select contour plot (vs. the default pseudocolor plot).",
379                       default=False, action="store_true")
380     parser.add_option("-n","--no-repeat-simulations", dest="simulations",
381                       help="Do not run simulations if they already exist (re-analyze a previous run's output).",
382                       default=True, action="store_false")
383     options,args = parser.parse_args()
384     initial_params = [float(p) for p in options.initial_params.split(",")]
385     param_ranges = parse_param_ranges_string(options.param_range)
386     velocity_file = args[0]
387
388     hm = HistogramMatcher(velocity_file, options.param_format, options.dir_prefix)
389     #hm.fit(initial_params, verbose=True)
390     hm.plot(param_ranges, logx=options.logx, logy=options.logy,
391             residual_type=options.residual,
392             plot_residuals=options.plot_residuals,
393             contour=options.contour_plot,
394             run_repeat_simulations=options.simulations)