From 3d02d3a946bd43b0984c198e34504c96425ed43a Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Tue, 19 Oct 2010 10:16:50 -0400 Subject: [PATCH] Move Histogram class from fit_force_histogram to new pysawsim.histogram module. --- pysawsim/fit_force_histograms.py | 160 +++++++-------------------- pysawsim/histogram.py | 178 +++++++++++++++++++++++++++++++ 2 files changed, 214 insertions(+), 124 deletions(-) create mode 100644 pysawsim/histogram.py diff --git a/pysawsim/fit_force_histograms.py b/pysawsim/fit_force_histograms.py index 2b64926..87cc328 100755 --- a/pysawsim/fit_force_histograms.py +++ b/pysawsim/fit_force_histograms.py @@ -21,18 +21,22 @@ import os # HACK, for getpid() import os.path import pickle -import numpy -from scipy.optimize import leastsq import shutil import sys + import matplotlib -matplotlib.use('Agg') # select backend that doesn't require X Windows +import numpy import pylab +from scipy.optimize import leastsq + + +matplotlib.use('Agg') # select backend that doesn't require X Windows +FIGURE = pylab.figure() # avoid memory problems. +"""`pylab` keeps internal references to all created figures, so share +a single instance. +""" -FIGURE = pylab.figure() # avoid memory problems. -# pylab keeps internal references to all created figures, so share a -# single instance. MEM_DEBUG = False def rss(): @@ -46,128 +50,13 @@ def rss(): status,stdout,stderr = invoke(call) return int(stdout) -class Histogram (object): - def from_data(self, data, bin_edges): - """ - All bins should be of equal width (so we can calculate which - bin a data point belongs to). - - data should be a numpy array. - """ - self.bin_edges = bin_edges - bin_width = self.bin_edges[1] - self.bin_edges[0] - - bin_is = numpy.floor((data - self.bin_edges[0])/bin_width) - self.counts = [] - for i in range(len(self.bin_edges)-1): - self.counts.append(sum(bin_is == i)) - self.total = float(len(data)) # because some data might be outside the bins - self.mean = data.mean() - self.std_dev = data.std() - def from_file(self, h_file): - """ - h_file format: - # comment and blank lines ignored - - ... - - e.g. - - 150e-12 10 - 200e-12 40 - 250e-12 5 - - The unfolding force should mark the left-hand side of the bin, and - all bins should be of equal width (so we know where the last one ends). - """ - self.bin_edges = [] - self.counts = [] - for line in file(h_file, "r").readlines(): - line = line.strip() - if len(line) == 0 or line[0] == "#": - continue # ignore blank lines and comments - force,count = line.split() #"\t") - self.bin_edges.append(float(force)) - self.counts.append(float(count)) - bin_width = self.bin_edges[1] - self.bin_edges[0] - self.bin_edges.append(self.counts[-1]+bin_width) - self.total = float(sum(self.counts)) - self.mean = 0 - for bin,count in zip(self.bin_edges, self.counts): - bin += bin_width / 2.0 - self.mean += bin * count - self.mean /= self.total - variance = 0 - for bin,count in zip(self.bin_edges, self.counts): - bin += bin_width / 2.0 - variance += (bin - self.mean)**2 * count - self.std_dev = numpy.sqrt(variance) - def normalize(self): - self.probabilities = [] - self.total = float(self.total) - for count in self.counts: - self.probabilities.append(count/self.total) - def residual(self, other, type='bin-height', plot=False, title=None): - """ - Each bin is weighted by the probability of unfolding in that - bin for the self histogram. For residual types where there is - a convention, this histogram is treated as the experimental - histogram and the other histogram is treated as the - theoretical one. - """ - if type in ['jensen-shannon', 'chi-squared']: - assert self.bin_edges == other.bin_edges - if type == 'jensen-shannon': - def d_KL_term(p,q): - """ - Kullback-Leibler divergence for a single bin, with the - exception that we return 0 if q==0, rather than - exploding like d_KL should. We can do this because - the way d_KL_term is used in the Jensen-Shannon - divergence, if q (really m) == 0, then p also == 0, - and the Jensen-Shannon divergence for the entire term - should be zero. - """ - if p==0 or q==0 or p==q: - return 0.0 - return p * numpy.log2(p/q) - residual = 0 - for probA,probB in zip(self.probabilities, other.probabilities): - m = (probA+probB) / 2.0 - residual += 0.5*(d_KL_term(probA,m) + d_KL_term(probB,m)) - elif type == 'chi-squared': - residual = 0 - for probA,probB in zip(self.probabilities, other.probabilities): - residual += (probA-probB)**2 / probB - elif type == 'mean': - residual = (other.mean - self.mean)**2 - elif type == 'std-dev': - residual = (other.std_dev - self.std_dev)**2 - else: - raise NotImplementedError, type - if plot == True: - self.plot_residual_comparison(other, residual, title) - return residual #*self.total # also weight by counts in our histogram - def plot_residual_comparison(self, other, residual=None, title=None): - if not hasattr(self, "_tag_i"): - self._tag_i = 0 - self._tag_i += 1 - - if residual == None: - residual = self.residual(other) - FIGURE.clear() - p = pylab.plot(self.bin_edges[:-1], self.probabilities, 'r-', - other.bin_edges[:-1], other.probabilities, 'b-') - if title != None: - pylab.title(title) - FIGURE.savefig("residual-%g-%d.png" % (residual, self._tag_i)) - class HistogramMatcher (object): def __init__(self, velocity_file, param_format_string, dir_prefix="v_dep"): self.experiment_histograms = self.read_force_histograms(velocity_file) self.param_format_string = param_format_string self.dir_prefix = dir_prefix + def read_force_histograms(self, v_file): """ v_file format: @@ -192,11 +81,13 @@ class HistogramMatcher (object): histograms[v] = h histograms[v].normalize() return histograms + def dirname(self, params): if not hasattr(self, "_tag_i"): self._tag_i = 0 self._tag_i += 1 return "%s-%d" % (self.dir_prefix, self._tag_i) + def generate_sawsim_histograms(self, params, histograms, run_repeat_simulation=True): velocities = histograms.keys() @@ -219,15 +110,18 @@ class HistogramMatcher (object): sawsim_histograms[velocity] = h sawsim_histograms[velocity].normalize() return sawsim_histograms + def param_string(self, params): """ Generate a string of non-velocity options to pass to sawsim for a given parameter list. """ return self.param_format_string % (params[0], params[1])#tuple(params) + def get_all_unfolding_data(self, dirname, velocity_string): datafile = os.path.join(dirname, "data_" + velocity_string) return numpy.fromfile(datafile, sep=" ") + def residual(self, params, type='bin-height', plot=False, run_repeat_simulation=True): sawsim_histograms = self.generate_sawsim_histograms( \ @@ -240,11 +134,17 @@ class HistogramMatcher (object): title = None if plot == True: title = ", ".join(["%g" % p for p in params]) - residual += experiment_histogram.residual( \ - sawsim_histogram, type=type, plot=plot, title=title) + r = experiment_histogram.residual(sawsim_histogram, type=type) + residual += r + if plot == True: + filename = "residual-%s-%g.png" % (title.replace(', ','-'), r) + self.plot_residual_comparison( + experiment_histogram, sawsim_histogram, residual=r, + title=title, filename=filename) print "residual", residual del(sawsim_histograms) return residual + def fit(self, initial_params, verbose=False): p,cov,info,mesg,ier = leastsq(self.residual, initial_params, full_output=True, maxfev=1000) @@ -258,6 +158,7 @@ class HistogramMatcher (object): else: print "Solution did not converge" return p + def plot(self, param_ranges, logx=False, logy=False, residual_type='bin-height', plot_residuals=False, contour=False, @@ -309,6 +210,17 @@ class HistogramMatcher (object): FIGURE.colorbar(p) FIGURE.savefig("figure.png") + def plot_residual_comparison(self, expeiment_hist, theory_hist, + residual, title, filename): + FIGURE.clear() + p = pylab.plot(experiment_hist.bin_edges[:-1], + experiment_hist.probabilities, 'r-', + theory_hist.bin_edges[:-1], + theory_hist.probabilities, 'b-') + pylab.title(title) + FIGURE.savefig(filename) + + def parse_param_ranges_string(string): """ '[Amin,Amax,Asteps],[Bmin,Bmax,Bsteps],...' diff --git a/pysawsim/histogram.py b/pysawsim/histogram.py new file mode 100644 index 0000000..9f2f65d --- /dev/null +++ b/pysawsim/histogram.py @@ -0,0 +1,178 @@ +# Copyright (C) 2009-2010 W. Trevor King +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# The author may be contacted at on the Internet, or +# write to Trevor King, Drexel University, Physics Dept., 3141 Chestnut St., +# Philadelphia PA 19104, USA. + +"""Histogram generation and comparison. +""" + +import numpy + + +class Histogram (object): + """A histogram with a flexible comparison method, `residual()`. + + >>> h = Histogram() + """ + def from_data(self, data, bin_edges): + """Initialize from `data`. + + All bins should be of equal width (so we can calculate which + bin a data point belongs to). + + `data` should be a numpy array. + """ + self.bin_edges = bin_edges + bin_width = self.bin_edges[1] - self.bin_edges[0] + + bin_is = numpy.floor((data - self.bin_edges[0])/bin_width) + self.counts = [] + for i in range(len(self.bin_edges)-1): + self.counts.append(sum(bin_is == i)) + self.total = float(len(data)) # some data might be outside the bins + self.mean = data.mean() + self.std_dev = data.std() + self.normalize() + + def from_stream(self, stream): + """Initialize from `stream`. + + File format: + + # comment and blank lines ignored + + ... + + For example: + + + `` should mark the left-hand side of the bin, and + all bins should be of equal width (so we know where the last + one ends). + + >>> import StringIO + >>> h = Histogram() + >>> h.from_stream(StringIO.StringIO('''# Force (N)\tUnfolding events + ... 150e-12\t10 + ... 200e-12\t40 + ... 250e-12\t5 + ... ''')) + >>> h.total + 55.0 + >>> h.counts + [10.0, 40.0, 5.0] + >>> h.bin_edges # doctest: +ELLIPSIS + [1.5e-10, 2.000...e-10, 2.500...e-10, 3e-10] + >>> h.probabilities # doctest: +ELLIPSIS + [0.181..., 0.727..., 0.0909...] + """ + self.bin_edges = [] + self.counts = [] + for line in stream.readlines(): + line = line.strip() + if len(line) == 0 or line[0] == "#": + continue # ignore blank lines and comments + bin_edge,count = line.split() + self.bin_edges.append(float(bin_edge)) + self.counts.append(float(count)) + bin_width = self.bin_edges[1] - self.bin_edges[0] + self.bin_edges.append(self.bin_edges[-1]+bin_width) + self.total = float(sum(self.counts)) + self.mean = 0 + for bin,count in zip(self.bin_edges, self.counts): + bin += bin_width / 2.0 + self.mean += bin * count + self.mean /= float(self.total) + variance = 0 + for bin,count in zip(self.bin_edges, self.counts): + bin += bin_width / 2.0 + variance += (bin - self.mean)**2 * count + self.std_dev = numpy.sqrt(variance) + self.normalize() + + def normalize(self): + self.total = float(self.total) + self.probabilities = [count/self.total for count in self.counts] + + def mean_residual(self, other): + return abs(other.mean - self.mean) + + def std_dev_residual(self, other): + return abs(other.std_dev - self.std_dev) + + def chi_squared_residual(self, other): + assert self.bin_edges == other.bin_edges + residual = 0 + for probA,probB in zip(self.probabilities, other.probabilities): + residual += (probA-probB)**2 / probB + return residual + + def jensen_shannon_residual(self, other): + assert self.bin_edges == other.bin_edges + def d_KL_term(p,q): + """ + Kullback-Leibler divergence for a single bin, with the + exception that we return 0 if q==0, rather than + exploding like d_KL should. We can do this because + the way d_KL_term is used in the Jensen-Shannon + divergence, if q (really m) == 0, then p also == 0, + and the Jensen-Shannon divergence for the entire term + should be zero. + """ + if p==0 or q==0 or p==q: + return 0.0 + return p * numpy.log2(p/q) + residual = 0 + for probA,probB in zip(self.probabilities, other.probabilities): + m = (probA+probB) / 2.0 + residual += 0.5*(d_KL_term(probA,m) + d_KL_term(probB,m)) + return residual + + def _method_to_type(self, name): + return name[:-len('_residual')].replace('_', '-') + + def _type_to_method(self, name): + return '%s_residual' % name.replace('-', '_') + + def types(self): + """Return a list of supported residual types. + """ + return sorted([self._method_to_type(x) + for x in dir(self) if x.endswith('_residual')]) + + def residual(self, other, type='jensen-shannon'): + """Compare this histogram with `other`. + + Supported comparison `type`\s may be found with `types()`: + + >>> h = Histogram() + >>> h.types() + ['chi-squared', 'jensen-shannon', 'mean', 'std-dev'] + + Selecting an invalid `type` raises an exception. + + >>> h.residual(other=None, type='invalid-type') + Traceback (most recent call last): + ... + AttributeError: 'Histogram' object has no attribute 'invalid_type_residual' + + For residual types where there is a convention, this histogram + is treated as the experimental histogram and the other + histogram is treated as the theoretical one. + """ + r_method = getattr(self, self._type_to_method(type)) + return r_method(other) -- 2.26.2