"""Histogram generation and comparison.
"""
+import matplotlib
+matplotlib.use('Agg') # select backend that doesn't require X Windows
import numpy
+import pylab
from . import log
"""Allow nosetests to split tests between processes.
"""
+FIGURE = pylab.figure() # avoid memory problems.
+"""`pylab` keeps internal references to all created figures, so share
+a single instance.
+"""
+
class Histogram (object):
"""A histogram with a flexible comparison method, `residual()`.
>>> h = Histogram()
"""
+ def __init__(self):
+ self.headings = None
+
def calculate_bin_edges(self, data, bin_width):
"""
>>> h = Histogram()
All bins should be of equal width (so we can calculate which
bin a data point belongs to).
-
- `data` should be a numpy array.
"""
- self.headings = None
- self.bin_edges = bin_edges
+ data = numpy.array(data)
+ self.bin_edges = numpy.array(bin_edges)
bin_width = self.bin_edges[1] - self.bin_edges[0]
bin_is = numpy.floor((data - self.bin_edges[0])/bin_width)
- self.counts = []
- for i in range(len(self.bin_edges)-1):
- self.counts.append(sum(bin_is == i).sum())
+ self.counts = numpy.zeros((len(self.bin_edges)-1,), dtype=numpy.int)
+ for i in range(len(self.counts)):
+ self.counts[i] = (bin_is == i).sum()
+ self.counts = numpy.array(self.counts)
self.total = float(len(data)) # some data might be outside the bins
self.mean = data.mean()
self.std_dev = data.std()
>>> h.counts
[10.0, 40.0, 5.0]
>>> h.bin_edges # doctest: +ELLIPSIS
- [1.5e-10, 2.000...e-10, 2.500...e-10, 3e-10]
+ [1.5e-10, 2...e-10, 2.5...e-10, 3e-10]
>>> h.probabilities # doctest: +ELLIPSIS
[0.181..., 0.727..., 0.0909...]
"""
self.counts.append(float(count))
bin_width = self.bin_edges[1] - self.bin_edges[0]
self.bin_edges.append(self.bin_edges[-1]+bin_width)
- self.total = float(sum(self.counts))
- self.mean = 0
- for bin,count in zip(self.bin_edges, self.counts):
- bin += bin_width / 2.0
- self.mean += bin * count
- self.mean /= float(self.total)
- variance = 0
- for bin,count in zip(self.bin_edges, self.counts):
- bin += bin_width / 2.0
- variance += (bin - self.mean)**2 * count
- self.std_dev = numpy.sqrt(variance)
- self.normalize()
+ self.analyze()
def to_stream(self, stream):
"""Write to `stream` with the same format as `from_stream()`.
1.5e-10\t10
2e-10\t40
2.5e-10\t5
+ >>> h.bin_edges = h.bin_edges[:2]
+ >>> h.counts = h.counts[:1]
+ >>> h.to_stream(sys.stdout)
+ ... # doctest: +NORMALIZE_WHITESPACE, +REPORT_UDIFF
+ #Force (N)\tUnfolding events
+ 1.5e-10\t10
+ 2e-10\t0
"""
if self.headings != None:
stream.write('#%s\n' % '\t'.join(self.headings))
for bin,count in zip(self.bin_edges, self.counts):
stream.write('%g\t%g\n' % (bin, count))
+ if len(self.counts) == 1:
+ # print an extra line so that readers can determine bin width
+ stream.write('%g\t0\n' % (self.bin_edges[-1]))
+
+ def analyze(self):
+ """Analyze `.counts` and `.bin_edges` if the raw data is missing.
+
+ Generate `.total`, `.mean`, and `.std_dev`, and run
+ `.normalize()`.
+ """
+ bin_width = self.bin_edges[1] - self.bin_edges[0]
+ self.total = float(sum(self.counts))
+ self.mean = 0
+ for bin,count in zip(self.bin_edges, self.counts):
+ bin += bin_width / 2.0
+ self.mean += bin * count
+ self.mean /= float(self.total)
+ variance = 0
+ for bin,count in zip(self.bin_edges, self.counts):
+ bin += bin_width / 2.0
+ variance += (bin - self.mean)**2 * count
+ self.std_dev = numpy.sqrt(variance)
+ self.normalize()
def normalize(self):
+ """Generate `.probabilities` from `.total` and `.counts`.
+ """
self.total = float(self.total)
self.probabilities = [count/self.total for count in self.counts]
return abs(other.std_dev - self.std_dev)
def chi_squared_residual(self, other):
- assert self.bin_edges == other.bin_edges
+ assert (self.bin_edges == other.bin_edges).all()
residual = 0
for probA,probB in zip(self.probabilities, other.probabilities):
residual += (probA-probB)**2 / probB
return residual
def jensen_shannon_residual(self, other):
- assert self.bin_edges == other.bin_edges
+ assert (self.bin_edges == other.bin_edges).all()
def d_KL_term(p,q):
"""
Kullback-Leibler divergence for a single bin, with the
"""
r_method = getattr(self, self._type_to_method(type))
return r_method(other)
+
+ def plot(self, title=None, filename=None):
+ FIGURE.clear()
+ axes = FIGURE.add_subplot(1, 1, 1)
+ axes.hist(x=self.bin_edges[:-1], # one fake entry for each bin
+ weights=self.counts, # weigh the fake entries by count
+ bins=self.bin_edges,
+ align='mid', histtype='stepfilled')
+ axes.set_title(title)
+ pylab.show()
+ FIGURE.savefig(filename)