1 # Copyright (C) 2009-2010 W. Trevor King <wking@drexel.edu>
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU General Public License as published by
5 # the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 # The author may be contacted at <wking@drexel.edu> on the Internet, or
17 # write to Trevor King, Drexel University, Physics Dept., 3141 Chestnut St.,
18 # Philadelphia PA 19104, USA.
20 """Histogram generation and comparison.
24 matplotlib.use('Agg') # select backend that doesn't require X Windows
31 _multiprocess_can_split_ = True
32 """Allow nosetests to split tests between processes.
35 FIGURE = pylab.figure() # avoid memory problems.
36 """`pylab` keeps internal references to all created figures, so share
41 class Histogram (object):
42 """A histogram with a flexible comparison method, `residual()`.
49 def calculate_bin_edges(self, data, bin_width):
52 >>> h.calculate_bin_edges(numpy.array([-7.5, 18.2, 4]), 10)
53 array([-10., 0., 10., 20.])
54 >>> h.calculate_bin_edges(numpy.array([-7.5, 18.2, 4, 20]), 10)
55 array([-10., 0., 10., 20.])
56 >>> h.calculate_bin_edges(numpy.array([0, 18.2, 4, 20]), 10)
57 array([ 0., 10., 20.])
58 >>> h.calculate_bin_edges(numpy.array([18.2, 4, 20]), 10)
59 array([ 0., 10., 20.])
60 >>> h.calculate_bin_edges(numpy.array([18.2, 20]), 10)
65 bin_start = m - m % bin_width
66 return numpy.arange(bin_start, M+bin_width, bin_width, dtype=data.dtype)
68 def from_data(self, data, bin_edges):
69 """Initialize from `data`.
71 All bins should be of equal width (so we can calculate which
72 bin a data point belongs to).
74 data = numpy.array(data)
75 self.bin_edges = numpy.array(bin_edges)
76 bin_width = self.bin_edges[1] - self.bin_edges[0]
78 bin_is = numpy.floor((data - self.bin_edges[0])/bin_width)
79 self.counts = numpy.zeros((len(self.bin_edges)-1,), dtype=numpy.int)
80 for i in range(len(self.counts)):
81 self.counts[i] = (bin_is == i).sum()
82 self.counts = numpy.array(self.counts)
83 self.total = float(len(data)) # some data might be outside the bins
84 self.mean = data.mean()
85 self.std_dev = data.std()
88 def from_stream(self, stream):
89 """Initialize from `stream`.
93 # comment and blank lines ignored
94 <bin_edge><whitespace><count>
97 `<bin_edge>` should mark the left-hand side of the bin, and
98 all bins should be of equal width (so we know where the last
103 >>> h.from_stream(StringIO.StringIO('''# Force (N)\\tUnfolding events
109 ['Force (N)', 'Unfolding events']
114 >>> h.bin_edges # doctest: +ELLIPSIS
115 [1.5e-10, 2...e-10, 2.5...e-10, 3e-10]
116 >>> h.probabilities # doctest: +ELLIPSIS
117 [0.181..., 0.727..., 0.0909...]
122 for line in stream.readlines():
124 if len(line) == 0 or line.startswith('#'):
125 if self.headings == None and line.startswith('#'):
126 line = line[len('#'):]
127 self.headings = [x.strip() for x in line.split('\t')]
128 continue # ignore blank lines and comments
130 bin_edge,count = line.split()
132 log().error('Unable to parse histogram line: "%s"' % line)
134 self.bin_edges.append(float(bin_edge))
135 self.counts.append(float(count))
136 bin_width = self.bin_edges[1] - self.bin_edges[0]
137 self.bin_edges.append(self.bin_edges[-1]+bin_width)
140 def to_stream(self, stream):
141 """Write to `stream` with the same format as `from_stream()`.
145 >>> h.headings = ['Force (N)', 'Unfolding events']
146 >>> h.bin_edges = [1.5e-10, 2e-10, 2.5e-10, 3e-10]
147 >>> h.counts = [10, 40, 5]
148 >>> h.to_stream(sys.stdout)
149 ... # doctest: +NORMALIZE_WHITESPACE, +REPORT_UDIFF
150 #Force (N)\tUnfolding events
154 >>> h.bin_edges = h.bin_edges[:2]
155 >>> h.counts = h.counts[:1]
156 >>> h.to_stream(sys.stdout)
157 ... # doctest: +NORMALIZE_WHITESPACE, +REPORT_UDIFF
158 #Force (N)\tUnfolding events
162 if self.headings != None:
163 stream.write('#%s\n' % '\t'.join(self.headings))
164 for bin,count in zip(self.bin_edges, self.counts):
165 stream.write('%g\t%g\n' % (bin, count))
166 if len(self.counts) == 1:
167 # print an extra line so that readers can determine bin width
168 stream.write('%g\t0\n' % (self.bin_edges[-1]))
171 """Analyze `.counts` and `.bin_edges` if the raw data is missing.
173 Generate `.total`, `.mean`, and `.std_dev`, and run
176 bin_width = self.bin_edges[1] - self.bin_edges[0]
177 self.total = float(sum(self.counts))
179 for bin,count in zip(self.bin_edges, self.counts):
180 bin += bin_width / 2.0
181 self.mean += bin * count
182 self.mean /= float(self.total)
184 for bin,count in zip(self.bin_edges, self.counts):
185 bin += bin_width / 2.0
186 variance += (bin - self.mean)**2 * count
187 self.std_dev = numpy.sqrt(variance)
191 """Generate `.probabilities` from `.total` and `.counts`.
193 self.total = float(self.total)
194 self.probabilities = [count/self.total for count in self.counts]
196 def mean_residual(self, other):
197 return abs(other.mean - self.mean)
199 def std_dev_residual(self, other):
200 return abs(other.std_dev - self.std_dev)
202 def chi_squared_residual(self, other):
203 assert (self.bin_edges == other.bin_edges).all()
205 for probA,probB in zip(self.probabilities, other.probabilities):
206 residual += (probA-probB)**2 / probB
209 def jensen_shannon_residual(self, other):
210 assert (self.bin_edges == other.bin_edges).all()
213 Kullback-Leibler divergence for a single bin, with the
214 exception that we return 0 if q==0, rather than
215 exploding like d_KL should. We can do this because
216 the way d_KL_term is used in the Jensen-Shannon
217 divergence, if q (really m) == 0, then p also == 0,
218 and the Jensen-Shannon divergence for the entire term
221 if p==0 or q==0 or p==q:
223 return p * numpy.log2(p/q)
225 for probA,probB in zip(self.probabilities, other.probabilities):
226 m = (probA+probB) / 2.0
227 residual += 0.5*(d_KL_term(probA,m) + d_KL_term(probB,m))
230 def _method_to_type(self, name):
231 return name[:-len('_residual')].replace('_', '-')
233 def _type_to_method(self, name):
234 return '%s_residual' % name.replace('-', '_')
237 """Return a list of supported residual types.
239 return sorted([self._method_to_type(x)
240 for x in dir(self) if x.endswith('_residual')])
242 def residual(self, other, type='jensen-shannon'):
243 """Compare this histogram with `other`.
245 Supported comparison `type`\s may be found with `types()`:
249 ['chi-squared', 'jensen-shannon', 'mean', 'std-dev']
251 Selecting an invalid `type` raises an exception.
253 >>> h.residual(other=None, type='invalid-type')
254 Traceback (most recent call last):
256 AttributeError: 'Histogram' object has no attribute 'invalid_type_residual'
258 For residual types where there is a convention, this histogram
259 is treated as the experimental histogram and the other
260 histogram is treated as the theoretical one.
262 r_method = getattr(self, self._type_to_method(type))
263 return r_method(other)
265 def plot(self, title=None, filename=None):
267 axes = FIGURE.add_subplot(1, 1, 1)
268 axes.hist(x=self.bin_edges[:-1], # one fake entry for each bin
269 weights=self.counts, # weigh the fake entries by count
271 align='mid', histtype='stepfilled')
272 axes.set_title(title)
274 FIGURE.savefig(filename)