1 # Copyright (C) 2009-2010 W. Trevor King <wking@drexel.edu>
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU General Public License as published by
5 # the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 # The author may be contacted at <wking@drexel.edu> on the Internet, or
17 # write to Trevor King, Drexel University, Physics Dept., 3141 Chestnut St.,
18 # Philadelphia PA 19104, USA.
20 """Histogram generation and comparison.
26 class Histogram (object):
27 """A histogram with a flexible comparison method, `residual()`.
31 def from_data(self, data, bin_edges):
32 """Initialize from `data`.
34 All bins should be of equal width (so we can calculate which
35 bin a data point belongs to).
37 `data` should be a numpy array.
39 self.bin_edges = bin_edges
40 bin_width = self.bin_edges[1] - self.bin_edges[0]
42 bin_is = numpy.floor((data - self.bin_edges[0])/bin_width)
44 for i in range(len(self.bin_edges)-1):
45 self.counts.append(sum(bin_is == i))
46 self.total = float(len(data)) # some data might be outside the bins
47 self.mean = data.mean()
48 self.std_dev = data.std()
51 def from_stream(self, stream):
52 """Initialize from `stream`.
56 # comment and blank lines ignored
57 <bin_edge><whitespace><count>
63 `<bin_edge>` should mark the left-hand side of the bin, and
64 all bins should be of equal width (so we know where the last
69 >>> h.from_stream(StringIO.StringIO('''# Force (N)\tUnfolding events
78 >>> h.bin_edges # doctest: +ELLIPSIS
79 [1.5e-10, 2.000...e-10, 2.500...e-10, 3e-10]
80 >>> h.probabilities # doctest: +ELLIPSIS
81 [0.181..., 0.727..., 0.0909...]
85 for line in stream.readlines():
87 if len(line) == 0 or line[0] == "#":
88 continue # ignore blank lines and comments
89 bin_edge,count = line.split()
90 self.bin_edges.append(float(bin_edge))
91 self.counts.append(float(count))
92 bin_width = self.bin_edges[1] - self.bin_edges[0]
93 self.bin_edges.append(self.bin_edges[-1]+bin_width)
94 self.total = float(sum(self.counts))
96 for bin,count in zip(self.bin_edges, self.counts):
97 bin += bin_width / 2.0
98 self.mean += bin * count
99 self.mean /= float(self.total)
101 for bin,count in zip(self.bin_edges, self.counts):
102 bin += bin_width / 2.0
103 variance += (bin - self.mean)**2 * count
104 self.std_dev = numpy.sqrt(variance)
108 self.total = float(self.total)
109 self.probabilities = [count/self.total for count in self.counts]
111 def mean_residual(self, other):
112 return abs(other.mean - self.mean)
114 def std_dev_residual(self, other):
115 return abs(other.std_dev - self.std_dev)
117 def chi_squared_residual(self, other):
118 assert self.bin_edges == other.bin_edges
120 for probA,probB in zip(self.probabilities, other.probabilities):
121 residual += (probA-probB)**2 / probB
124 def jensen_shannon_residual(self, other):
125 assert self.bin_edges == other.bin_edges
128 Kullback-Leibler divergence for a single bin, with the
129 exception that we return 0 if q==0, rather than
130 exploding like d_KL should. We can do this because
131 the way d_KL_term is used in the Jensen-Shannon
132 divergence, if q (really m) == 0, then p also == 0,
133 and the Jensen-Shannon divergence for the entire term
136 if p==0 or q==0 or p==q:
138 return p * numpy.log2(p/q)
140 for probA,probB in zip(self.probabilities, other.probabilities):
141 m = (probA+probB) / 2.0
142 residual += 0.5*(d_KL_term(probA,m) + d_KL_term(probB,m))
145 def _method_to_type(self, name):
146 return name[:-len('_residual')].replace('_', '-')
148 def _type_to_method(self, name):
149 return '%s_residual' % name.replace('-', '_')
152 """Return a list of supported residual types.
154 return sorted([self._method_to_type(x)
155 for x in dir(self) if x.endswith('_residual')])
157 def residual(self, other, type='jensen-shannon'):
158 """Compare this histogram with `other`.
160 Supported comparison `type`\s may be found with `types()`:
164 ['chi-squared', 'jensen-shannon', 'mean', 'std-dev']
166 Selecting an invalid `type` raises an exception.
168 >>> h.residual(other=None, type='invalid-type')
169 Traceback (most recent call last):
171 AttributeError: 'Histogram' object has no attribute 'invalid_type_residual'
173 For residual types where there is a convention, this histogram
174 is treated as the experimental histogram and the other
175 histogram is treated as the theoretical one.
177 r_method = getattr(self, self._type_to_method(type))
178 return r_method(other)