3 import os # HACK, for getpid()
7 from scipy.optimize import leastsq
9 from subprocess import Popen, PIPE
13 matplotlib.use('Agg') # select backend that doesn't require X Windows
16 FIGURE = pylab.figure() # avoid memory problems.
17 # pylab keeps internal references to all created figures, so share a
21 class CommandError(Exception):
22 def __init__(self, command, status, stdout, stderr):
23 strerror = ["Command failed (%d):\n %s\n" % (status, stderr),
24 "while executing\n %s" % command]
25 Exception.__init__(self, "\n".join(strerror))
26 self.command = command
31 def invoke(cmd_string, stdin=None, expect=(0,), cwd=None, verbose=True):
33 expect should be a tuple of allowed exit codes. cwd should be
34 the directory from which the command will be executed.
39 print >> sys.stderr, "%s$ %s" % (cwd, cmd_string)
41 if sys.platform != "win32" and False:
42 q = Popen(cmd_string, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=cwd)
44 # win32 don't have os.execvp() so have to run command in a shell
45 q = Popen(cmd_string, stdin=PIPE, stdout=PIPE, stderr=PIPE,
48 raise CommandError(args, status=e.args[0], stdout="", stderr=e)
49 output,error = q.communicate(input=stdin)
52 print >> sys.stderr, "%d\n%s%s" % (status, output, error)
53 if status not in expect:
54 raise CommandError(args, status, output, error)
55 return status, output, error
59 For debugging memory usage.
61 resident set size, the non-swapped physical memory that a task has
64 call = "ps -o rss= -p %d" % os.getpid()
65 status,stdout,stderr = invoke(call)
68 class Histogram (object):
69 def from_data(self, data, bin_edges):
71 All bins should be of equal width (so we can calculate which
72 bin a data point belongs to).
74 data should be a numpy array.
76 self.bin_edges = bin_edges
77 bin_width = self.bin_edges[1] - self.bin_edges[0]
79 bin_is = numpy.floor((data - self.bin_edges[0])/bin_width)
81 for i in range(len(self.bin_edges)-1):
82 self.counts.append(sum(bin_is == i))
83 self.total = float(len(data)) # because some data might be outside the bins
84 self.mean = data.mean()
85 self.std_dev = data.std()
86 def from_file(self, h_file):
89 # comment and blank lines ignored
90 <unfolding force in N><TAB><count>
99 The unfolding force should mark the left-hand side of the bin, and
100 all bins should be of equal width (so we know where the last one ends).
104 for line in file(h_file, "r").readlines():
106 if len(line) == 0 or line[0] == "#":
107 continue # ignore blank lines and comments
108 force,count = line.split() #"\t")
109 self.bin_edges.append(float(force))
110 self.counts.append(float(count))
111 bin_width = self.bin_edges[1] - self.bin_edges[0]
112 self.bin_edges.append(self.counts[-1]+bin_width)
113 self.total = float(sum(self.counts))
115 for bin,count in zip(self.bin_edges, self.counts):
116 bin += bin_width / 2.0
117 self.mean += bin * count
118 self.mean /= self.total
120 for bin,count in zip(self.bin_edges, self.counts):
121 bin += bin_width / 2.0
122 variance += (bin - self.mean)**2 * count
123 self.std_dev = numpy.sqrt(variance)
125 self.probabilities = []
126 self.total = float(self.total)
127 for count in self.counts:
128 self.probabilities.append(count/self.total)
129 def residual(self, other, type='bin-height', plot=False, title=None):
131 Each bin is weighted by the probability of unfolding in that
132 bin for the self histogram. For residual types where there is
133 a convention, this histogram is treated as the experimental
134 histogram and the other histogram is treated as the
137 if type in ['jensen-shannon', 'chi-squared']:
138 assert self.bin_edges == other.bin_edges
139 if type == 'jensen-shannon':
142 Kullback-Leibler divergence for a single bin, with the
143 exception that we return 0 if q==0, rather than
144 exploding like d_KL should. We can do this because
145 the way d_KL_term is used in the Jensen-Shannon
146 divergence, if q (really m) == 0, then p also == 0,
147 and the Jensen-Shannon divergence for the entire term
150 if p==0 or q==0 or p==q:
152 return p * numpy.log2(p/q)
154 for probA,probB in zip(self.probabilities, other.probabilities):
155 m = (probA+probB) / 2.0
156 residual += 0.5*(d_KL_term(probA,m) + d_KL_term(probB,m))
157 elif type == 'chi-squared':
159 for probA,probB in zip(self.probabilities, other.probabilities):
160 residual += (probA-probB)**2 / probB
162 residual = (other.mean - self.mean)**2
163 elif type == 'std-dev':
164 residual = (other.std_dev - self.std_dev)**2
166 raise NotImplementedError, type
168 self.plot_residual_comparison(other, residual, title)
169 return residual #*self.total # also weight by counts in our histogram
170 def plot_residual_comparison(self, other, residual=None, title=None):
171 if not hasattr(self, "_tag_i"):
176 residual = self.residual(other)
178 p = pylab.plot(self.bin_edges[:-1], self.probabilities, 'r-',
179 other.bin_edges[:-1], other.probabilities, 'b-')
182 FIGURE.savefig("residual-%g-%d.png" % (residual, self._tag_i))
185 class HistogramMatcher (object):
186 def __init__(self, velocity_file, param_format_string, dir_prefix="v_dep"):
187 self.experiment_histograms = self.read_force_histograms(velocity_file)
188 self.param_format_string = param_format_string
189 self.dir_prefix = dir_prefix
190 def read_force_histograms(self, v_file):
193 # comment and blank lines ignored
194 <velocity in m/s><TAB><path to histogram file>
203 v_file_dir = os.path.dirname(v_file)
204 for line in file(v_file, "r").readlines():
206 if len(line) == 0 or line[0] == "#":
207 continue # ignore blank lines and comments
208 v,h_file = line.split("\t")
210 h.from_file(os.path.join(v_file_dir, h_file))
212 histograms[v].normalize()
214 def dirname(self, params):
215 if not hasattr(self, "_tag_i"):
218 return "%s-%d" % (self.dir_prefix, self._tag_i)
219 def generate_sawsim_histograms(self, params, histograms,
220 run_repeat_simulation=True):
221 velocities = histograms.keys()
222 dirname = self.dirname(params)
223 cmd = "run_vel_dep %s %s %s" \
225 ",".join([str(v) for v in velocities]),
226 self.param_string(params))
227 if os.path.exists(dirname) and run_repeat_simulation == True:
228 print 'repeating simulation'
229 shutil.rmtree(dirname)
230 if not os.path.exists(dirname):
232 sawsim_histograms = {}
233 for velocity in velocities:
234 unfolding_forces = self.get_all_unfolding_data(dirname, str(velocity))
235 bin_edges = histograms[velocity].bin_edges
237 h.from_data(unfolding_forces, bin_edges)
238 sawsim_histograms[velocity] = h
239 sawsim_histograms[velocity].normalize()
240 return sawsim_histograms
241 def param_string(self, params):
243 Generate a string of non-velocity options to pass to sawsim
244 for a given parameter list.
246 return self.param_format_string % (params[0], params[1])#tuple(params)
247 def get_all_unfolding_data(self, dirname, velocity_string):
248 datafile = os.path.join(dirname, "data_" + velocity_string)
249 return numpy.fromfile(datafile, sep=" ")
250 def residual(self, params, type='bin-height', plot=False,
251 run_repeat_simulation=True):
252 sawsim_histograms = self.generate_sawsim_histograms( \
253 params, self.experiment_histograms,
254 run_repeat_simulation=run_repeat_simulation)
255 assert sorted(sawsim_histograms.keys()) == sorted(self.experiment_histograms.keys())
257 for velocity,experiment_histogram in self.experiment_histograms.items():
258 sawsim_histogram = sawsim_histograms[velocity]
261 title = ", ".join(["%g" % p for p in params])
262 residual += experiment_histogram.residual( \
263 sawsim_histogram, type=type, plot=plot, title=title)
264 print "residual", residual
265 del(sawsim_histograms)
267 def fit(self, initial_params, verbose=False):
268 p,cov,info,mesg,ier = leastsq(self.residual, initial_params,
269 full_output=True, maxfev=1000)
271 print "Fitted params:",p
272 print "Covariance mx:",cov
276 print "Solution converged"
278 print "Solution did not converge"
280 def plot(self, param_ranges, logx=False, logy=False,
281 residual_type='bin-height',
282 plot_residuals=False, contour=False,
283 run_repeat_simulations=True):
284 xranges = param_ranges[0]
285 yranges = param_ranges[1]
287 x = numpy.linspace(*xranges)
290 x = numpy.exp(numpy.linspace(numpy.log(m), numpy.log(M), n))
292 y = numpy.linspace(*yranges)
295 y = numpy.exp(numpy.linspace(numpy.log(m), numpy.log(M), n))
296 X, Y = pylab.meshgrid(x,y)
297 C = numpy.zeros((len(y)-1, len(x)-1))
298 for i,xi in enumerate(x[:-1]):
299 for j,yi in enumerate(y[:-1]):
300 print i, j, i*(len(y)-1) + j, (len(x)-1)*(len(y)-1)
302 r = self.residual(params, type=residual_type,
304 run_repeat_simulation=run_repeat_simulations)
305 C[j,i] = numpy.log(r) # better resolution in valleys
306 if MEM_DEBUG == True:
307 print >> sys.stderr, "RSS: %d KB" % rss()
308 C = numpy.nan_to_num(C) # NaN -> 0
309 fid = file("fit_force_histograms-XYC.pkl", "wb")
310 pickle.dump([X,Y,C], fid)
314 # [X,Y,C] = pickle.load(file("fit_force_histograms-XYC.pkl", "rb"))
317 axes = FIGURE.add_subplot(111)
319 axes.set_xscale('log')
321 axes.set_yscale('log')
323 p = axes.contour(X[:-1,:-1], Y[:-1,:-1], C)
324 # [:-1,:-1] to strip dummy last row & column from X&Y.
325 else: # pseudocolor plot
326 p = axes.pcolor(X, Y, C)
327 axes.autoscale_view(tight=True)
329 FIGURE.savefig("figure.png")
331 def parse_param_ranges_string(string):
333 '[Amin,Amax,Asteps],[Bmin,Bmax,Bsteps],...'
335 [[Amin,Amax,Asteps],[Bmin,Bmax,Bsteps],...]
338 for range_string in string.split("],["):
339 range_number_strings = range_string.strip("[]").split(",")
340 ranges.append([float(x) for x in range_number_strings])
343 if __name__ == "__main__":
345 usage = "%prog [options] velocity_file"
347 parser = optparse.OptionParser(usage)
348 parser.add_option("-f","--param-format", dest="param_format",
350 help="Convert params to sawsim options (%default).",
351 default=('-s cantilever,hooke,0.05 -N1 -s folded,null -N8 -s "unfolded,wlc,{0.39e-9,28e-9}" -k "folded,unfolded,bell,{%g,%g}" -q folded'))
352 parser.add_option("-p","--initial-params", dest="initial_params",
354 help="Initial params for fitting (%default).",
355 default='3.3e-4,0.25e-9')
356 parser.add_option("-r","--param-range", dest="param_range",
358 help="Param range for plotting (%default).",
359 default='[1e-5,1e-3,20],[0.1e-9,1e-9,20]')
360 parser.add_option("--logx", dest="logx",
361 help="Use a log scale for the x range.",
362 default=False, action="store_true")
363 parser.add_option("--logy", dest="logy",
364 help="Use a log scale for the y range.",
365 default=False, action="store_true")
366 parser.add_option("-d","--dir-prefix", dest="dir_prefix",
368 help="Prefix to sawsim data directories (%default).",
370 parser.add_option("-R","--residual", dest="residual",
372 help="Residual type (from 'jensen-shannon', 'chi-squared', 'mean', 'std-dev'; default: %default).",
373 default='jensen-shannon')
374 parser.add_option("-P","--plot-residuals", dest="plot_residuals",
375 help="Generate residual difference plots for each point in the plot range.",
376 default=False, action="store_true")
377 parser.add_option("-c","--contour-plot", dest="contour_plot",
378 help="Select contour plot (vs. the default pseudocolor plot).",
379 default=False, action="store_true")
380 parser.add_option("-n","--no-repeat-simulations", dest="simulations",
381 help="Do not run simulations if they already exist (re-analyze a previous run's output).",
382 default=True, action="store_false")
383 options,args = parser.parse_args()
384 initial_params = [float(p) for p in options.initial_params.split(",")]
385 param_ranges = parse_param_ranges_string(options.param_range)
386 velocity_file = args[0]
388 hm = HistogramMatcher(velocity_file, options.param_format, options.dir_prefix)
389 #hm.fit(initial_params, verbose=True)
390 hm.plot(param_ranges, logx=options.logx, logy=options.logy,
391 residual_type=options.residual,
392 plot_residuals=options.plot_residuals,
393 contour=options.contour_plot,
394 run_repeat_simulations=options.simulations)