import os # HACK, for getpid()
import os.path
import pickle
-import numpy
-from scipy.optimize import leastsq
import shutil
import sys
+
import matplotlib
-matplotlib.use('Agg') # select backend that doesn't require X Windows
+import numpy
import pylab
+from scipy.optimize import leastsq
+
+
+matplotlib.use('Agg') # select backend that doesn't require X Windows
+FIGURE = pylab.figure() # avoid memory problems.
+"""`pylab` keeps internal references to all created figures, so share
+a single instance.
+"""
-FIGURE = pylab.figure() # avoid memory problems.
-# pylab keeps internal references to all created figures, so share a
-# single instance.
MEM_DEBUG = False
def rss():
status,stdout,stderr = invoke(call)
return int(stdout)
-class Histogram (object):
- def from_data(self, data, bin_edges):
- """
- All bins should be of equal width (so we can calculate which
- bin a data point belongs to).
-
- data should be a numpy array.
- """
- self.bin_edges = bin_edges
- bin_width = self.bin_edges[1] - self.bin_edges[0]
-
- bin_is = numpy.floor((data - self.bin_edges[0])/bin_width)
- self.counts = []
- for i in range(len(self.bin_edges)-1):
- self.counts.append(sum(bin_is == i))
- self.total = float(len(data)) # because some data might be outside the bins
- self.mean = data.mean()
- self.std_dev = data.std()
- def from_file(self, h_file):
- """
- h_file format:
- # comment and blank lines ignored
- <unfolding force in N><TAB><count>
- ...
-
- e.g.
-
- 150e-12 10
- 200e-12 40
- 250e-12 5
-
- The unfolding force should mark the left-hand side of the bin, and
- all bins should be of equal width (so we know where the last one ends).
- """
- self.bin_edges = []
- self.counts = []
- for line in file(h_file, "r").readlines():
- line = line.strip()
- if len(line) == 0 or line[0] == "#":
- continue # ignore blank lines and comments
- force,count = line.split() #"\t")
- self.bin_edges.append(float(force))
- self.counts.append(float(count))
- bin_width = self.bin_edges[1] - self.bin_edges[0]
- self.bin_edges.append(self.counts[-1]+bin_width)
- self.total = float(sum(self.counts))
- self.mean = 0
- for bin,count in zip(self.bin_edges, self.counts):
- bin += bin_width / 2.0
- self.mean += bin * count
- self.mean /= self.total
- variance = 0
- for bin,count in zip(self.bin_edges, self.counts):
- bin += bin_width / 2.0
- variance += (bin - self.mean)**2 * count
- self.std_dev = numpy.sqrt(variance)
- def normalize(self):
- self.probabilities = []
- self.total = float(self.total)
- for count in self.counts:
- self.probabilities.append(count/self.total)
- def residual(self, other, type='bin-height', plot=False, title=None):
- """
- Each bin is weighted by the probability of unfolding in that
- bin for the self histogram. For residual types where there is
- a convention, this histogram is treated as the experimental
- histogram and the other histogram is treated as the
- theoretical one.
- """
- if type in ['jensen-shannon', 'chi-squared']:
- assert self.bin_edges == other.bin_edges
- if type == 'jensen-shannon':
- def d_KL_term(p,q):
- """
- Kullback-Leibler divergence for a single bin, with the
- exception that we return 0 if q==0, rather than
- exploding like d_KL should. We can do this because
- the way d_KL_term is used in the Jensen-Shannon
- divergence, if q (really m) == 0, then p also == 0,
- and the Jensen-Shannon divergence for the entire term
- should be zero.
- """
- if p==0 or q==0 or p==q:
- return 0.0
- return p * numpy.log2(p/q)
- residual = 0
- for probA,probB in zip(self.probabilities, other.probabilities):
- m = (probA+probB) / 2.0
- residual += 0.5*(d_KL_term(probA,m) + d_KL_term(probB,m))
- elif type == 'chi-squared':
- residual = 0
- for probA,probB in zip(self.probabilities, other.probabilities):
- residual += (probA-probB)**2 / probB
- elif type == 'mean':
- residual = (other.mean - self.mean)**2
- elif type == 'std-dev':
- residual = (other.std_dev - self.std_dev)**2
- else:
- raise NotImplementedError, type
- if plot == True:
- self.plot_residual_comparison(other, residual, title)
- return residual #*self.total # also weight by counts in our histogram
- def plot_residual_comparison(self, other, residual=None, title=None):
- if not hasattr(self, "_tag_i"):
- self._tag_i = 0
- self._tag_i += 1
-
- if residual == None:
- residual = self.residual(other)
- FIGURE.clear()
- p = pylab.plot(self.bin_edges[:-1], self.probabilities, 'r-',
- other.bin_edges[:-1], other.probabilities, 'b-')
- if title != None:
- pylab.title(title)
- FIGURE.savefig("residual-%g-%d.png" % (residual, self._tag_i))
-
class HistogramMatcher (object):
def __init__(self, velocity_file, param_format_string, dir_prefix="v_dep"):
self.experiment_histograms = self.read_force_histograms(velocity_file)
self.param_format_string = param_format_string
self.dir_prefix = dir_prefix
+
def read_force_histograms(self, v_file):
"""
v_file format:
histograms[v] = h
histograms[v].normalize()
return histograms
+
def dirname(self, params):
if not hasattr(self, "_tag_i"):
self._tag_i = 0
self._tag_i += 1
return "%s-%d" % (self.dir_prefix, self._tag_i)
+
def generate_sawsim_histograms(self, params, histograms,
run_repeat_simulation=True):
velocities = histograms.keys()
sawsim_histograms[velocity] = h
sawsim_histograms[velocity].normalize()
return sawsim_histograms
+
def param_string(self, params):
"""
Generate a string of non-velocity options to pass to sawsim
for a given parameter list.
"""
return self.param_format_string % (params[0], params[1])#tuple(params)
+
def get_all_unfolding_data(self, dirname, velocity_string):
datafile = os.path.join(dirname, "data_" + velocity_string)
return numpy.fromfile(datafile, sep=" ")
+
def residual(self, params, type='bin-height', plot=False,
run_repeat_simulation=True):
sawsim_histograms = self.generate_sawsim_histograms( \
title = None
if plot == True:
title = ", ".join(["%g" % p for p in params])
- residual += experiment_histogram.residual( \
- sawsim_histogram, type=type, plot=plot, title=title)
+ r = experiment_histogram.residual(sawsim_histogram, type=type)
+ residual += r
+ if plot == True:
+ filename = "residual-%s-%g.png" % (title.replace(', ','-'), r)
+ self.plot_residual_comparison(
+ experiment_histogram, sawsim_histogram, residual=r,
+ title=title, filename=filename)
print "residual", residual
del(sawsim_histograms)
return residual
+
def fit(self, initial_params, verbose=False):
p,cov,info,mesg,ier = leastsq(self.residual, initial_params,
full_output=True, maxfev=1000)
else:
print "Solution did not converge"
return p
+
def plot(self, param_ranges, logx=False, logy=False,
residual_type='bin-height',
plot_residuals=False, contour=False,
FIGURE.colorbar(p)
FIGURE.savefig("figure.png")
+ def plot_residual_comparison(self, expeiment_hist, theory_hist,
+ residual, title, filename):
+ FIGURE.clear()
+ p = pylab.plot(experiment_hist.bin_edges[:-1],
+ experiment_hist.probabilities, 'r-',
+ theory_hist.bin_edges[:-1],
+ theory_hist.probabilities, 'b-')
+ pylab.title(title)
+ FIGURE.savefig(filename)
+
+
def parse_param_ranges_string(string):
"""
'[Amin,Amax,Asteps],[Bmin,Bmax,Bsteps],...'
--- /dev/null
+# Copyright (C) 2009-2010 W. Trevor King <wking@drexel.edu>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# The author may be contacted at <wking@drexel.edu> on the Internet, or
+# write to Trevor King, Drexel University, Physics Dept., 3141 Chestnut St.,
+# Philadelphia PA 19104, USA.
+
+"""Histogram generation and comparison.
+"""
+
+import numpy
+
+
+class Histogram (object):
+ """A histogram with a flexible comparison method, `residual()`.
+
+ >>> h = Histogram()
+ """
+ def from_data(self, data, bin_edges):
+ """Initialize from `data`.
+
+ All bins should be of equal width (so we can calculate which
+ bin a data point belongs to).
+
+ `data` should be a numpy array.
+ """
+ self.bin_edges = bin_edges
+ bin_width = self.bin_edges[1] - self.bin_edges[0]
+
+ bin_is = numpy.floor((data - self.bin_edges[0])/bin_width)
+ self.counts = []
+ for i in range(len(self.bin_edges)-1):
+ self.counts.append(sum(bin_is == i))
+ self.total = float(len(data)) # some data might be outside the bins
+ self.mean = data.mean()
+ self.std_dev = data.std()
+ self.normalize()
+
+ def from_stream(self, stream):
+ """Initialize from `stream`.
+
+ File format:
+
+ # comment and blank lines ignored
+ <bin_edge><whitespace><count>
+ ...
+
+ For example:
+
+
+ `<bin_edge>` should mark the left-hand side of the bin, and
+ all bins should be of equal width (so we know where the last
+ one ends).
+
+ >>> import StringIO
+ >>> h = Histogram()
+ >>> h.from_stream(StringIO.StringIO('''# Force (N)\tUnfolding events
+ ... 150e-12\t10
+ ... 200e-12\t40
+ ... 250e-12\t5
+ ... '''))
+ >>> h.total
+ 55.0
+ >>> h.counts
+ [10.0, 40.0, 5.0]
+ >>> h.bin_edges # doctest: +ELLIPSIS
+ [1.5e-10, 2.000...e-10, 2.500...e-10, 3e-10]
+ >>> h.probabilities # doctest: +ELLIPSIS
+ [0.181..., 0.727..., 0.0909...]
+ """
+ self.bin_edges = []
+ self.counts = []
+ for line in stream.readlines():
+ line = line.strip()
+ if len(line) == 0 or line[0] == "#":
+ continue # ignore blank lines and comments
+ bin_edge,count = line.split()
+ self.bin_edges.append(float(bin_edge))
+ self.counts.append(float(count))
+ bin_width = self.bin_edges[1] - self.bin_edges[0]
+ self.bin_edges.append(self.bin_edges[-1]+bin_width)
+ self.total = float(sum(self.counts))
+ self.mean = 0
+ for bin,count in zip(self.bin_edges, self.counts):
+ bin += bin_width / 2.0
+ self.mean += bin * count
+ self.mean /= float(self.total)
+ variance = 0
+ for bin,count in zip(self.bin_edges, self.counts):
+ bin += bin_width / 2.0
+ variance += (bin - self.mean)**2 * count
+ self.std_dev = numpy.sqrt(variance)
+ self.normalize()
+
+ def normalize(self):
+ self.total = float(self.total)
+ self.probabilities = [count/self.total for count in self.counts]
+
+ def mean_residual(self, other):
+ return abs(other.mean - self.mean)
+
+ def std_dev_residual(self, other):
+ return abs(other.std_dev - self.std_dev)
+
+ def chi_squared_residual(self, other):
+ assert self.bin_edges == other.bin_edges
+ residual = 0
+ for probA,probB in zip(self.probabilities, other.probabilities):
+ residual += (probA-probB)**2 / probB
+ return residual
+
+ def jensen_shannon_residual(self, other):
+ assert self.bin_edges == other.bin_edges
+ def d_KL_term(p,q):
+ """
+ Kullback-Leibler divergence for a single bin, with the
+ exception that we return 0 if q==0, rather than
+ exploding like d_KL should. We can do this because
+ the way d_KL_term is used in the Jensen-Shannon
+ divergence, if q (really m) == 0, then p also == 0,
+ and the Jensen-Shannon divergence for the entire term
+ should be zero.
+ """
+ if p==0 or q==0 or p==q:
+ return 0.0
+ return p * numpy.log2(p/q)
+ residual = 0
+ for probA,probB in zip(self.probabilities, other.probabilities):
+ m = (probA+probB) / 2.0
+ residual += 0.5*(d_KL_term(probA,m) + d_KL_term(probB,m))
+ return residual
+
+ def _method_to_type(self, name):
+ return name[:-len('_residual')].replace('_', '-')
+
+ def _type_to_method(self, name):
+ return '%s_residual' % name.replace('-', '_')
+
+ def types(self):
+ """Return a list of supported residual types.
+ """
+ return sorted([self._method_to_type(x)
+ for x in dir(self) if x.endswith('_residual')])
+
+ def residual(self, other, type='jensen-shannon'):
+ """Compare this histogram with `other`.
+
+ Supported comparison `type`\s may be found with `types()`:
+
+ >>> h = Histogram()
+ >>> h.types()
+ ['chi-squared', 'jensen-shannon', 'mean', 'std-dev']
+
+ Selecting an invalid `type` raises an exception.
+
+ >>> h.residual(other=None, type='invalid-type')
+ Traceback (most recent call last):
+ ...
+ AttributeError: 'Histogram' object has no attribute 'invalid_type_residual'
+
+ For residual types where there is a convention, this histogram
+ is treated as the experimental histogram and the other
+ histogram is treated as the theoretical one.
+ """
+ r_method = getattr(self, self._type_to_method(type))
+ return r_method(other)