pysawsim/histogram.py

   1 # Copyright (C) 2009-2010  W. Trevor King <wking@drexel.edu>
   2 #
   3 # This program is free software: you can redistribute it and/or modify
   4 # it under the terms of the GNU General Public License as published by
   5 # the Free Software Foundation, either version 3 of the License, or
   6 # (at your option) any later version.
   7 #
   8 # This program is distributed in the hope that it will be useful,
   9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11 # GNU General Public License for more details.
  12 #
  13 # You should have received a copy of the GNU General Public License
  14 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  15 #
  16 # The author may be contacted at <wking@drexel.edu> on the Internet, or
  17 # write to Trevor King, Drexel University, Physics Dept., 3141 Chestnut St.,
  18 # Philadelphia PA 19104, USA.
  19
  20 """Histogram generation and comparison.
  21 """
  22
  23 import numpy
  24
  25
  26 class Histogram (object):
  27     """A histogram with a flexible comparison method, `residual()`.
  28
  29     >>> h = Histogram()
  30     """
  31     def from_data(self, data, bin_edges):
  32         """Initialize from `data`.
  33
  34         All bins should be of equal width (so we can calculate which
  35         bin a data point belongs to).
  36
  37         `data` should be a numpy array.
  38         """
  39         self.bin_edges = bin_edges
  40         bin_width = self.bin_edges[1] - self.bin_edges[0]
  41
  42         bin_is = numpy.floor((data - self.bin_edges[0])/bin_width)
  43         self.counts = []
  44         for i in range(len(self.bin_edges)-1):
  45             self.counts.append(sum(bin_is == i))
  46         self.total = float(len(data)) # some data might be outside the bins
  47         self.mean = data.mean()
  48         self.std_dev = data.std()
  49         self.normalize()
  50
  51     def from_stream(self, stream):
  52         """Initialize from `stream`.
  53
  54         File format:
  55
  56             # comment and blank lines ignored
  57             <bin_edge><whitespace><count>
  58             ...
  59
  60         For example:
  61
  62
  63         `<bin_edge>` should mark the left-hand side of the bin, and
  64         all bins should be of equal width (so we know where the last
  65         one ends).
  66
  67         >>> import StringIO
  68         >>> h = Histogram()
  69         >>> h.from_stream(StringIO.StringIO('''# Force (N)\tUnfolding events
  70         ... 150e-12\t10
  71         ... 200e-12\t40
  72         ... 250e-12\t5
  73         ... '''))
  74         >>> h.total
  75         55.0
  76         >>> h.counts
  77         [10.0, 40.0, 5.0]
  78         >>> h.bin_edges  # doctest: +ELLIPSIS
  79         [1.5e-10, 2.000...e-10, 2.500...e-10, 3e-10]
  80         >>> h.probabilities  # doctest: +ELLIPSIS
  81         [0.181..., 0.727..., 0.0909...]
  82         """
  83         self.bin_edges = []
  84         self.counts = []
  85         for line in stream.readlines():
  86             line = line.strip()
  87             if len(line) == 0 or line[0] == "#":
  88                 continue # ignore blank lines and comments
  89             bin_edge,count = line.split()
  90             self.bin_edges.append(float(bin_edge))
  91             self.counts.append(float(count))
  92         bin_width = self.bin_edges[1] - self.bin_edges[0]
  93         self.bin_edges.append(self.bin_edges[-1]+bin_width)
  94         self.total = float(sum(self.counts))
  95         self.mean = 0
  96         for bin,count in zip(self.bin_edges, self.counts):
  97             bin += bin_width / 2.0
  98             self.mean += bin * count
  99         self.mean /=  float(self.total)
 100         variance = 0
 101         for bin,count in zip(self.bin_edges, self.counts):
 102             bin += bin_width / 2.0
 103             variance += (bin - self.mean)**2 * count
 104         self.std_dev = numpy.sqrt(variance)
 105         self.normalize()
 106
 107     def normalize(self):
 108         self.total = float(self.total)
 109         self.probabilities = [count/self.total for count in self.counts]
 110
 111     def mean_residual(self, other):
 112         return abs(other.mean - self.mean)
 113
 114     def std_dev_residual(self, other):
 115         return abs(other.std_dev - self.std_dev)
 116
 117     def chi_squared_residual(self, other):
 118         assert self.bin_edges == other.bin_edges
 119         residual = 0
 120         for probA,probB in zip(self.probabilities, other.probabilities):
 121             residual += (probA-probB)**2 / probB
 122         return residual
 123
 124     def jensen_shannon_residual(self, other):
 125         assert self.bin_edges == other.bin_edges
 126         def d_KL_term(p,q):
 127             """
 128             Kullback-Leibler divergence for a single bin, with the
 129             exception that we return 0 if q==0, rather than
 130             exploding like d_KL should.  We can do this because
 131             the way d_KL_term is used in the Jensen-Shannon
 132             divergence, if q (really m) == 0, then p also == 0,
 133             and the Jensen-Shannon divergence for the entire term
 134             should be zero.
 135             """
 136             if p==0 or q==0 or p==q:
 137                 return 0.0
 138             return p * numpy.log2(p/q)
 139         residual = 0
 140         for probA,probB in zip(self.probabilities, other.probabilities):
 141             m = (probA+probB) / 2.0
 142             residual += 0.5*(d_KL_term(probA,m) + d_KL_term(probB,m))
 143         return residual
 144
 145     def _method_to_type(self, name):
 146         return name[:-len('_residual')].replace('_', '-')
 147
 148     def _type_to_method(self, name):
 149         return '%s_residual' % name.replace('-', '_')
 150
 151     def types(self):
 152         """Return a list of supported residual types.
 153         """
 154         return sorted([self._method_to_type(x)
 155                        for x in dir(self) if x.endswith('_residual')])
 156
 157     def residual(self, other, type='jensen-shannon'):
 158         """Compare this histogram with `other`.
 159
 160         Supported comparison `type`\s may be found with `types()`:
 161
 162         >>> h = Histogram()
 163         >>> h.types()
 164         ['chi-squared', 'jensen-shannon', 'mean', 'std-dev']
 165
 166         Selecting an invalid `type` raises an exception.
 167
 168         >>> h.residual(other=None, type='invalid-type')
 169         Traceback (most recent call last):
 170           ...
 171         AttributeError: 'Histogram' object has no attribute 'invalid_type_residual'
 172
 173         For residual types where there is a convention, this histogram
 174         is treated as the experimental histogram and the other
 175         histogram is treated as the theoretical one.
 176         """
 177         r_method = getattr(self, self._type_to_method(type))
 178         return r_method(other)