From 3d02d3a946bd43b0984c198e34504c96425ed43a Mon Sep 17 00:00:00 2001
From: "W. Trevor King" <wking@drexel.edu>
Date: Tue, 19 Oct 2010 10:16:50 -0400
Subject: [PATCH] Move Histogram class from fit_force_histogram to new
 pysawsim.histogram module.

---
 pysawsim/fit_force_histograms.py | 160 +++++++--------------------
 pysawsim/histogram.py            | 178 +++++++++++++++++++++++++++++++
 2 files changed, 214 insertions(+), 124 deletions(-)
 create mode 100644 pysawsim/histogram.py
diff --git a/pysawsim/fit_force_histograms.py b/pysawsim/fit_force_histograms.py
index 2b64926..87cc328 100755
--- a/pysawsim/fit_force_histograms.py
+++ b/pysawsim/fit_force_histograms.py
@@ -21,18 +21,22 @@
 import os # HACK, for getpid()
 import os.path
 import pickle
-import numpy
-from scipy.optimize import leastsq
 import shutil
 import sys
 
+
 import matplotlib
-matplotlib.use('Agg') # select backend that doesn't require X Windows
+import numpy
 import pylab
+from scipy.optimize import leastsq
+
+
+matplotlib.use('Agg')  # select backend that doesn't require X Windows
+FIGURE = pylab.figure()  # avoid memory problems.
+"""`pylab` keeps internal references to all created figures, so share
+a single instance.
+"""
 
-FIGURE = pylab.figure() # avoid memory problems.
-# pylab keeps internal references to all created figures, so share a
-# single instance.
 MEM_DEBUG = False
 
 def rss():
@@ -46,128 +50,13 @@ def rss():
     status,stdout,stderr = invoke(call)
     return int(stdout)
 
-class Histogram (object):
-    def from_data(self, data, bin_edges):
-        """
-        All bins should be of equal width (so we can calculate which
-        bin a data point belongs to).
-
-        data should be a numpy array.
-        """
-        self.bin_edges = bin_edges
-        bin_width = self.bin_edges[1] - self.bin_edges[0]
-
-        bin_is = numpy.floor((data - self.bin_edges[0])/bin_width)
-        self.counts = []
-        for i in range(len(self.bin_edges)-1):
-            self.counts.append(sum(bin_is == i))
-        self.total = float(len(data)) # because some data might be outside the bins
-        self.mean = data.mean()
-        self.std_dev = data.std()
-    def from_file(self, h_file):
-        """
-        h_file format:
-        # comment and blank lines ignored
-        <unfolding force in N><TAB><count>
-        ...
-
-        e.g.
-
-        150e-12 10
-        200e-12 40
-        250e-12 5
-
-        The unfolding force should mark the left-hand side of the bin, and
-        all bins should be of equal width (so we know where the last one ends).
-        """
-        self.bin_edges = []
-        self.counts = []
-        for line in file(h_file, "r").readlines():
-            line = line.strip()
-            if len(line) == 0 or line[0] == "#":
-                continue # ignore blank lines and comments
-            force,count = line.split() #"\t")
-            self.bin_edges.append(float(force))
-            self.counts.append(float(count))
-        bin_width = self.bin_edges[1] - self.bin_edges[0]
-        self.bin_edges.append(self.counts[-1]+bin_width)
-        self.total = float(sum(self.counts))
-        self.mean = 0
-        for bin,count in zip(self.bin_edges, self.counts):
-            bin += bin_width / 2.0
-            self.mean += bin * count
-        self.mean /=  self.total
-        variance = 0
-        for bin,count in zip(self.bin_edges, self.counts):
-            bin += bin_width / 2.0
-            variance += (bin - self.mean)**2 * count
-        self.std_dev = numpy.sqrt(variance)
-    def normalize(self):
-        self.probabilities = []
-        self.total = float(self.total)
-        for count in self.counts:
-            self.probabilities.append(count/self.total)
-    def residual(self, other, type='bin-height', plot=False, title=None):
-        """
-        Each bin is weighted by the probability of unfolding in that
-        bin for the self histogram.  For residual types where there is
-        a convention, this histogram is treated as the experimental
-        histogram and the other histogram is treated as the
-        theoretical one.
-        """
-        if type in ['jensen-shannon', 'chi-squared']:
-            assert self.bin_edges == other.bin_edges
-        if type == 'jensen-shannon':
-            def d_KL_term(p,q):
-                """
-                Kullback-Leibler divergence for a single bin, with the
-                exception that we return 0 if q==0, rather than
-                exploding like d_KL should.  We can do this because
-                the way d_KL_term is used in the Jensen-Shannon
-                divergence, if q (really m) == 0, then p also == 0,
-                and the Jensen-Shannon divergence for the entire term
-                should be zero.
-                """
-                if p==0 or q==0 or p==q:
-                    return 0.0
-                return p * numpy.log2(p/q)
-            residual = 0
-            for probA,probB in zip(self.probabilities, other.probabilities):
-                m = (probA+probB) / 2.0
-                residual += 0.5*(d_KL_term(probA,m) + d_KL_term(probB,m))
-        elif type == 'chi-squared':
-            residual = 0
-            for probA,probB in zip(self.probabilities, other.probabilities):
-                residual += (probA-probB)**2 / probB
-        elif type == 'mean':
-            residual = (other.mean - self.mean)**2
-        elif type == 'std-dev':
-            residual = (other.std_dev - self.std_dev)**2
-        else:
-            raise NotImplementedError, type
-        if plot == True:
-            self.plot_residual_comparison(other, residual, title)
-        return residual #*self.total # also weight by counts in our histogram
-    def plot_residual_comparison(self, other, residual=None, title=None):
-        if not hasattr(self, "_tag_i"):
-            self._tag_i = 0
-        self._tag_i += 1
-
-        if residual == None:
-            residual = self.residual(other)
-        FIGURE.clear()
-        p = pylab.plot(self.bin_edges[:-1], self.probabilities, 'r-',
-                       other.bin_edges[:-1], other.probabilities, 'b-')
-        if title != None:
-            pylab.title(title)
-        FIGURE.savefig("residual-%g-%d.png" % (residual, self._tag_i))
-
 
 class HistogramMatcher (object):
     def __init__(self, velocity_file, param_format_string, dir_prefix="v_dep"):
         self.experiment_histograms = self.read_force_histograms(velocity_file)
         self.param_format_string = param_format_string
         self.dir_prefix = dir_prefix
+
     def read_force_histograms(self, v_file):
         """
         v_file format:
@@ -192,11 +81,13 @@ class HistogramMatcher (object):
             histograms[v] = h
             histograms[v].normalize()
         return histograms
+
     def dirname(self, params):
         if not hasattr(self, "_tag_i"):
             self._tag_i = 0
         self._tag_i += 1
         return "%s-%d" % (self.dir_prefix, self._tag_i)
+
     def generate_sawsim_histograms(self, params, histograms,
                                    run_repeat_simulation=True):
         velocities = histograms.keys()
@@ -219,15 +110,18 @@ class HistogramMatcher (object):
             sawsim_histograms[velocity] = h
             sawsim_histograms[velocity].normalize()
         return sawsim_histograms
+
     def param_string(self, params):
         """
         Generate a string of non-velocity options to pass to sawsim
         for a given parameter list.
         """
         return self.param_format_string % (params[0], params[1])#tuple(params)
+
     def get_all_unfolding_data(self, dirname, velocity_string):
         datafile = os.path.join(dirname, "data_" + velocity_string)
         return numpy.fromfile(datafile, sep=" ")
+
     def residual(self, params, type='bin-height', plot=False,
                  run_repeat_simulation=True):
         sawsim_histograms = self.generate_sawsim_histograms( \
@@ -240,11 +134,17 @@ class HistogramMatcher (object):
             title = None
             if plot == True:
                 title = ", ".join(["%g" % p for p in params])
-            residual += experiment_histogram.residual( \
-                sawsim_histogram, type=type, plot=plot, title=title)
+            r = experiment_histogram.residual(sawsim_histogram, type=type)
+            residual += r
+            if plot == True:
+                filename = "residual-%s-%g.png" % (title.replace(', ','-'), r)
+                self.plot_residual_comparison(
+                    experiment_histogram, sawsim_histogram, residual=r,
+                    title=title, filename=filename)
         print "residual", residual
         del(sawsim_histograms)
         return residual
+
     def fit(self, initial_params, verbose=False):
         p,cov,info,mesg,ier = leastsq(self.residual, initial_params,
                                       full_output=True, maxfev=1000)
@@ -258,6 +158,7 @@ class HistogramMatcher (object):
             else:
                 print "Solution did not converge"
         return p
+
     def plot(self, param_ranges, logx=False, logy=False,
              residual_type='bin-height',
              plot_residuals=False, contour=False,
@@ -309,6 +210,17 @@ class HistogramMatcher (object):
         FIGURE.colorbar(p)
         FIGURE.savefig("figure.png")
 
+    def plot_residual_comparison(self, expeiment_hist, theory_hist,
+                                 residual, title, filename):
+        FIGURE.clear()
+        p = pylab.plot(experiment_hist.bin_edges[:-1],
+                       experiment_hist.probabilities, 'r-',
+                       theory_hist.bin_edges[:-1],
+                       theory_hist.probabilities, 'b-')
+        pylab.title(title)
+        FIGURE.savefig(filename)
+
+
 def parse_param_ranges_string(string):
     """
     '[Amin,Amax,Asteps],[Bmin,Bmax,Bsteps],...'
diff --git a/pysawsim/histogram.py b/pysawsim/histogram.py
new file mode 100644
index 0000000..9f2f65d
--- /dev/null
+++ b/pysawsim/histogram.py
@@ -0,0 +1,178 @@
+# Copyright (C) 2009-2010  W. Trevor King <wking@drexel.edu>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# The author may be contacted at <wking@drexel.edu> on the Internet, or
+# write to Trevor King, Drexel University, Physics Dept., 3141 Chestnut St.,
+# Philadelphia PA 19104, USA.
+
+"""Histogram generation and comparison.
+"""
+
+import numpy
+
+
+class Histogram (object):
+    """A histogram with a flexible comparison method, `residual()`.
+
+    >>> h = Histogram()
+    """
+    def from_data(self, data, bin_edges):
+        """Initialize from `data`.
+
+        All bins should be of equal width (so we can calculate which
+        bin a data point belongs to).
+
+        `data` should be a numpy array.
+        """
+        self.bin_edges = bin_edges
+        bin_width = self.bin_edges[1] - self.bin_edges[0]
+
+        bin_is = numpy.floor((data - self.bin_edges[0])/bin_width)
+        self.counts = []
+        for i in range(len(self.bin_edges)-1):
+            self.counts.append(sum(bin_is == i))
+        self.total = float(len(data)) # some data might be outside the bins
+        self.mean = data.mean()
+        self.std_dev = data.std()
+        self.normalize()
+
+    def from_stream(self, stream):
+        """Initialize from `stream`.
+
+        File format:
+
+            # comment and blank lines ignored
+            <bin_edge><whitespace><count>
+            ...
+
+        For example:
+
+
+        `<bin_edge>` should mark the left-hand side of the bin, and
+        all bins should be of equal width (so we know where the last
+        one ends).
+
+        >>> import StringIO
+        >>> h = Histogram()
+        >>> h.from_stream(StringIO.StringIO('''# Force (N)\tUnfolding events
+        ... 150e-12\t10
+        ... 200e-12\t40
+        ... 250e-12\t5
+        ... '''))
+        >>> h.total
+        55.0
+        >>> h.counts
+        [10.0, 40.0, 5.0]
+        >>> h.bin_edges  # doctest: +ELLIPSIS
+        [1.5e-10, 2.000...e-10, 2.500...e-10, 3e-10]
+        >>> h.probabilities  # doctest: +ELLIPSIS
+        [0.181..., 0.727..., 0.0909...]
+        """
+        self.bin_edges = []
+        self.counts = []
+        for line in stream.readlines():
+            line = line.strip()
+            if len(line) == 0 or line[0] == "#":
+                continue # ignore blank lines and comments
+            bin_edge,count = line.split()
+            self.bin_edges.append(float(bin_edge))
+            self.counts.append(float(count))
+        bin_width = self.bin_edges[1] - self.bin_edges[0]
+        self.bin_edges.append(self.bin_edges[-1]+bin_width)
+        self.total = float(sum(self.counts))
+        self.mean = 0
+        for bin,count in zip(self.bin_edges, self.counts):
+            bin += bin_width / 2.0
+            self.mean += bin * count
+        self.mean /=  float(self.total)
+        variance = 0
+        for bin,count in zip(self.bin_edges, self.counts):
+            bin += bin_width / 2.0
+            variance += (bin - self.mean)**2 * count
+        self.std_dev = numpy.sqrt(variance)
+        self.normalize()
+
+    def normalize(self):
+        self.total = float(self.total)
+        self.probabilities = [count/self.total for count in self.counts]
+
+    def mean_residual(self, other):
+        return abs(other.mean - self.mean)
+
+    def std_dev_residual(self, other):
+        return abs(other.std_dev - self.std_dev)
+
+    def chi_squared_residual(self, other):
+        assert self.bin_edges == other.bin_edges
+        residual = 0
+        for probA,probB in zip(self.probabilities, other.probabilities):
+            residual += (probA-probB)**2 / probB
+        return residual
+
+    def jensen_shannon_residual(self, other):
+        assert self.bin_edges == other.bin_edges
+        def d_KL_term(p,q):
+            """
+            Kullback-Leibler divergence for a single bin, with the
+            exception that we return 0 if q==0, rather than
+            exploding like d_KL should.  We can do this because
+            the way d_KL_term is used in the Jensen-Shannon
+            divergence, if q (really m) == 0, then p also == 0,
+            and the Jensen-Shannon divergence for the entire term
+            should be zero.
+            """
+            if p==0 or q==0 or p==q:
+                return 0.0
+            return p * numpy.log2(p/q)
+        residual = 0
+        for probA,probB in zip(self.probabilities, other.probabilities):
+            m = (probA+probB) / 2.0
+            residual += 0.5*(d_KL_term(probA,m) + d_KL_term(probB,m))
+        return residual
+
+    def _method_to_type(self, name):
+        return name[:-len('_residual')].replace('_', '-')
+
+    def _type_to_method(self, name):
+        return '%s_residual' % name.replace('-', '_')
+
+    def types(self):
+        """Return a list of supported residual types.
+        """
+        return sorted([self._method_to_type(x)
+                       for x in dir(self) if x.endswith('_residual')])
+
+    def residual(self, other, type='jensen-shannon'):
+        """Compare this histogram with `other`.
+
+        Supported comparison `type`\s may be found with `types()`:
+
+        >>> h = Histogram()
+        >>> h.types()
+        ['chi-squared', 'jensen-shannon', 'mean', 'std-dev']
+
+        Selecting an invalid `type` raises an exception.
+
+        >>> h.residual(other=None, type='invalid-type')
+        Traceback (most recent call last):
+          ...
+        AttributeError: 'Histogram' object has no attribute 'invalid_type_residual'
+
+        For residual types where there is a convention, this histogram
+        is treated as the experimental histogram and the other
+        histogram is treated as the theoretical one.
+        """
+        r_method = getattr(self, self._type_to_method(type))
+        return r_method(other)
-- 
2.26.2