pysawsim/histogram.py

   1 # Copyright (C) 2009-2010  W. Trevor King <wking@drexel.edu>
   2 #
   3 # This program is free software: you can redistribute it and/or modify
   4 # it under the terms of the GNU General Public License as published by
   5 # the Free Software Foundation, either version 3 of the License, or
   6 # (at your option) any later version.
   7 #
   8 # This program is distributed in the hope that it will be useful,
   9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11 # GNU General Public License for more details.
  12 #
  13 # You should have received a copy of the GNU General Public License
  14 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  15 #
  16 # The author may be contacted at <wking@drexel.edu> on the Internet, or
  17 # write to Trevor King, Drexel University, Physics Dept., 3141 Chestnut St.,
  18 # Philadelphia PA 19104, USA.
  19
  20 """Histogram generation and comparison.
  21 """
  22
  23 import numpy
  24
  25 from . import log
  26
  27
  28 class Histogram (object):
  29     """A histogram with a flexible comparison method, `residual()`.
  30
  31     >>> h = Histogram()
  32     """
  33     def calculate_bin_edges(self, data, bin_width):
  34         """
  35         >>> h = Histogram()
  36         >>> h.calculate_bin_edges(numpy.array([-7.5, 18.2, 4]), 10)
  37         array([-10.,   0.,  10.,  20.])
  38         >>> h.calculate_bin_edges(numpy.array([-7.5, 18.2, 4, 20]), 10)
  39         array([-10.,   0.,  10.,  20.])
  40         >>> h.calculate_bin_edges(numpy.array([0, 18.2, 4, 20]), 10)
  41         array([  0.,  10.,  20.])
  42         >>> h.calculate_bin_edges(numpy.array([18.2, 4, 20]), 10)
  43         array([  0.,  10.,  20.])
  44         >>> h.calculate_bin_edges(numpy.array([18.2, 20]), 10)
  45         array([ 10.,  20.])
  46         """
  47         m = data.min()
  48         M = data.max()
  49         bin_start = m - m % bin_width
  50         return numpy.arange(bin_start, M+bin_width, bin_width, dtype=data.dtype)
  51
  52     def from_data(self, data, bin_edges):
  53         """Initialize from `data`.
  54
  55         All bins should be of equal width (so we can calculate which
  56         bin a data point belongs to).
  57
  58         `data` should be a numpy array.
  59         """
  60         self.headings = None
  61         self.bin_edges = bin_edges
  62         bin_width = self.bin_edges[1] - self.bin_edges[0]
  63
  64         bin_is = numpy.floor((data - self.bin_edges[0])/bin_width)
  65         self.counts = []
  66         for i in range(len(self.bin_edges)-1):
  67             self.counts.append(sum(bin_is == i))
  68         self.total = float(len(data)) # some data might be outside the bins
  69         self.mean = data.mean()
  70         self.std_dev = data.std()
  71         self.normalize()
  72
  73     def from_stream(self, stream):
  74         """Initialize from `stream`.
  75
  76         File format:
  77
  78             # comment and blank lines ignored
  79             <bin_edge><whitespace><count>
  80             ...
  81
  82         `<bin_edge>` should mark the left-hand side of the bin, and
  83         all bins should be of equal width (so we know where the last
  84         one ends).
  85
  86         >>> import StringIO
  87         >>> h = Histogram()
  88         >>> h.from_stream(StringIO.StringIO('''# Force (N)\\tUnfolding events
  89         ... 150e-12\\t10
  90         ... 200e-12\\t40
  91         ... 250e-12\\t5
  92         ... '''))
  93         >>> h.headings
  94         ['Force (N)', 'Unfolding events']
  95         >>> h.total
  96         55.0
  97         >>> h.counts
  98         [10.0, 40.0, 5.0]
  99         >>> h.bin_edges  # doctest: +ELLIPSIS
 100         [1.5e-10, 2.000...e-10, 2.500...e-10, 3e-10]
 101         >>> h.probabilities  # doctest: +ELLIPSIS
 102         [0.181..., 0.727..., 0.0909...]
 103         """
 104         self.headings = None
 105         self.bin_edges = []
 106         self.counts = []
 107         for line in stream.readlines():
 108             line = line.strip()
 109             if len(line) == 0 or line.startswith('#'):
 110                 if self.headings == None and line.startswith('#'):
 111                     line = line[len('#'):]
 112                     self.headings = [x.strip() for x in line.split('\t')]
 113                 continue # ignore blank lines and comments
 114             try:
 115                 bin_edge,count = line.split()
 116             except ValueError:
 117                 log().error('Unable to parse histogram line: "%s"' % line)
 118                 raise
 119             self.bin_edges.append(float(bin_edge))
 120             self.counts.append(float(count))
 121         bin_width = self.bin_edges[1] - self.bin_edges[0]
 122         self.bin_edges.append(self.bin_edges[-1]+bin_width)
 123         self.total = float(sum(self.counts))
 124         self.mean = 0
 125         for bin,count in zip(self.bin_edges, self.counts):
 126             bin += bin_width / 2.0
 127             self.mean += bin * count
 128         self.mean /=  float(self.total)
 129         variance = 0
 130         for bin,count in zip(self.bin_edges, self.counts):
 131             bin += bin_width / 2.0
 132             variance += (bin - self.mean)**2 * count
 133         self.std_dev = numpy.sqrt(variance)
 134         self.normalize()
 135
 136     def to_stream(self, stream):
 137         """Write to `stream` with the same format as `from_stream()`.
 138
 139         >>> import sys
 140         >>> h = Histogram()
 141         >>> h.headings = ['Force (N)', 'Unfolding events']
 142         >>> h.bin_edges = [1.5e-10, 2e-10, 2.5e-10, 3e-10]
 143         >>> h.counts = [10, 40, 5]
 144         >>> h.to_stream(sys.stdout)
 145         ... # doctest: +NORMALIZE_WHITESPACE, +REPORT_UDIFF
 146         #Force (N)\tUnfolding events
 147         1.5e-10\t10
 148         2e-10\t40
 149         2.5e-10\t5
 150         """
 151         if self.headings != None:
 152             stream.write('#%s\n' % '\t'.join(self.headings))
 153         for bin,count in zip(self.bin_edges, self.counts):
 154             stream.write('%g\t%g\n' % (bin, count))
 155
 156     def normalize(self):
 157         self.total = float(self.total)
 158         self.probabilities = [count/self.total for count in self.counts]
 159
 160     def mean_residual(self, other):
 161         return abs(other.mean - self.mean)
 162
 163     def std_dev_residual(self, other):
 164         return abs(other.std_dev - self.std_dev)
 165
 166     def chi_squared_residual(self, other):
 167         assert self.bin_edges == other.bin_edges
 168         residual = 0
 169         for probA,probB in zip(self.probabilities, other.probabilities):
 170             residual += (probA-probB)**2 / probB
 171         return residual
 172
 173     def jensen_shannon_residual(self, other):
 174         assert self.bin_edges == other.bin_edges
 175         def d_KL_term(p,q):
 176             """
 177             Kullback-Leibler divergence for a single bin, with the
 178             exception that we return 0 if q==0, rather than
 179             exploding like d_KL should.  We can do this because
 180             the way d_KL_term is used in the Jensen-Shannon
 181             divergence, if q (really m) == 0, then p also == 0,
 182             and the Jensen-Shannon divergence for the entire term
 183             should be zero.
 184             """
 185             if p==0 or q==0 or p==q:
 186                 return 0.0
 187             return p * numpy.log2(p/q)
 188         residual = 0
 189         for probA,probB in zip(self.probabilities, other.probabilities):
 190             m = (probA+probB) / 2.0
 191             residual += 0.5*(d_KL_term(probA,m) + d_KL_term(probB,m))
 192         return residual
 193
 194     def _method_to_type(self, name):
 195         return name[:-len('_residual')].replace('_', '-')
 196
 197     def _type_to_method(self, name):
 198         return '%s_residual' % name.replace('-', '_')
 199
 200     def types(self):
 201         """Return a list of supported residual types.
 202         """
 203         return sorted([self._method_to_type(x)
 204                        for x in dir(self) if x.endswith('_residual')])
 205
 206     def residual(self, other, type='jensen-shannon'):
 207         """Compare this histogram with `other`.
 208
 209         Supported comparison `type`\s may be found with `types()`:
 210
 211         >>> h = Histogram()
 212         >>> h.types()
 213         ['chi-squared', 'jensen-shannon', 'mean', 'std-dev']
 214
 215         Selecting an invalid `type` raises an exception.
 216
 217         >>> h.residual(other=None, type='invalid-type')
 218         Traceback (most recent call last):
 219           ...
 220         AttributeError: 'Histogram' object has no attribute 'invalid_type_residual'
 221
 222         For residual types where there is a convention, this histogram
 223         is treated as the experimental histogram and the other
 224         histogram is treated as the theoretical one.
 225         """
 226         r_method = getattr(self, self._type_to_method(type))
 227         return r_method(other)