Ensure that printed histograms always have at least two bins listed.

[sawsim.git] / pysawsim / histogram.py
diff --git a/pysawsim/histogram.py b/pysawsim/histogram.py

index bb66c05f2bb0c3eebd74cb40b67ae1bafa21ebfb..6c326d2ef6cb53e6efd83fffbf338dfff565f860 100644 (file)
--- a/pysawsim/histogram.py
+++ b/pysawsim/histogram.py
@@ -20,7 +20,10 @@
  """Histogram generation and comparison.
  """
  
+import matplotlib
+matplotlib.use('Agg')  # select backend that doesn't require X Windows
  import numpy
+import pylab
  
  from . import log
  
@@ -29,12 +32,20 @@ _multiprocess_can_split_ = True
  """Allow nosetests to split tests between processes.
  """
  
+FIGURE = pylab.figure()  # avoid memory problems.
+"""`pylab` keeps internal references to all created figures, so share
+a single instance.
+"""
+
  
  class Histogram (object):
      """A histogram with a flexible comparison method, `residual()`.
  
      >>> h = Histogram()
      """
+    def __init__(self):
+        self.headings = None
+
      def calculate_bin_edges(self, data, bin_width):
          """
          >>> h = Histogram()
@@ -59,17 +70,16 @@ class Histogram (object):
  
          All bins should be of equal width (so we can calculate which
          bin a data point belongs to).
-
-        `data` should be a numpy array.
          """
-        self.headings = None
-        self.bin_edges = bin_edges
+        data = numpy.array(data)
+        self.bin_edges = numpy.array(bin_edges)
          bin_width = self.bin_edges[1] - self.bin_edges[0]
  
          bin_is = numpy.floor((data - self.bin_edges[0])/bin_width)
-        self.counts = []
-        for i in range(len(self.bin_edges)-1):
-            self.counts.append(sum(bin_is == i).sum())
+        self.counts = numpy.zeros((len(self.bin_edges)-1,), dtype=numpy.int)
+        for i in range(len(self.counts)):
+            self.counts[i] = (bin_is == i).sum()
+        self.counts = numpy.array(self.counts)
          self.total = float(len(data)) # some data might be outside the bins
          self.mean = data.mean()
          self.std_dev = data.std()
@@ -102,7 +112,7 @@ class Histogram (object):
          >>> h.counts
          [10.0, 40.0, 5.0]
          >>> h.bin_edges  # doctest: +ELLIPSIS
-        [1.5e-10, 2.000...e-10, 2.500...e-10, 3e-10]
+        [1.5e-10, 2...e-10, 2.5...e-10, 3e-10]
          >>> h.probabilities  # doctest: +ELLIPSIS
          [0.181..., 0.727..., 0.0909...]
          """
@@ -125,18 +135,7 @@ class Histogram (object):
              self.counts.append(float(count))
          bin_width = self.bin_edges[1] - self.bin_edges[0]
          self.bin_edges.append(self.bin_edges[-1]+bin_width)
-        self.total = float(sum(self.counts))
-        self.mean = 0
-        for bin,count in zip(self.bin_edges, self.counts):
-            bin += bin_width / 2.0
-            self.mean += bin * count
-        self.mean /=  float(self.total)
-        variance = 0
-        for bin,count in zip(self.bin_edges, self.counts):
-            bin += bin_width / 2.0
-            variance += (bin - self.mean)**2 * count
-        self.std_dev = numpy.sqrt(variance)
-        self.normalize()
+        self.analyze()
  
      def to_stream(self, stream):
          """Write to `stream` with the same format as `from_stream()`.
@@ -152,13 +151,45 @@ class Histogram (object):
          1.5e-10\t10
          2e-10\t40
          2.5e-10\t5
+        >>> h.bin_edges = h.bin_edges[:2]
+        >>> h.counts = h.counts[:1]
+        >>> h.to_stream(sys.stdout)
+        ... # doctest: +NORMALIZE_WHITESPACE, +REPORT_UDIFF
+        #Force (N)\tUnfolding events
+        1.5e-10\t10
+        2e-10\t0
          """
          if self.headings != None:
              stream.write('#%s\n' % '\t'.join(self.headings))
          for bin,count in zip(self.bin_edges, self.counts):
              stream.write('%g\t%g\n' % (bin, count))
+        if len(self.counts) == 1:
+            # print an extra line so that readers can determine bin width
+            stream.write('%g\t0\n' % (self.bin_edges[-1]))
+
+    def analyze(self):
+        """Analyze `.counts` and `.bin_edges` if the raw data is missing.
+
+        Generate `.total`, `.mean`, and `.std_dev`, and run
+        `.normalize()`.
+        """
+        bin_width = self.bin_edges[1] - self.bin_edges[0]
+        self.total = float(sum(self.counts))
+        self.mean = 0
+        for bin,count in zip(self.bin_edges, self.counts):
+            bin += bin_width / 2.0
+            self.mean += bin * count
+        self.mean /=  float(self.total)
+        variance = 0
+        for bin,count in zip(self.bin_edges, self.counts):
+            bin += bin_width / 2.0
+            variance += (bin - self.mean)**2 * count
+        self.std_dev = numpy.sqrt(variance)
+        self.normalize()
  
      def normalize(self):
+        """Generate `.probabilities` from `.total` and `.counts`.
+        """
          self.total = float(self.total)
          self.probabilities = [count/self.total for count in self.counts]
  
@@ -169,14 +200,14 @@ class Histogram (object):
          return abs(other.std_dev - self.std_dev)
  
      def chi_squared_residual(self, other):
-        assert self.bin_edges == other.bin_edges
+        assert (self.bin_edges == other.bin_edges).all()
          residual = 0
          for probA,probB in zip(self.probabilities, other.probabilities):
              residual += (probA-probB)**2 / probB
          return residual
  
      def jensen_shannon_residual(self, other):
-        assert self.bin_edges == other.bin_edges
+        assert (self.bin_edges == other.bin_edges).all()
          def d_KL_term(p,q):
              """
              Kullback-Leibler divergence for a single bin, with the
@@ -230,3 +261,14 @@ class Histogram (object):
          """
          r_method = getattr(self, self._type_to_method(type))
          return r_method(other)
+
+    def plot(self, title=None, filename=None):
+        FIGURE.clear()
+        axes = FIGURE.add_subplot(1, 1, 1)
+        axes.hist(x=self.bin_edges[:-1],  # one fake entry for each bin
+                  weights=self.counts,    # weigh the fake entries by count
+                  bins=self.bin_edges,
+                  align='mid', histtype='stepfilled')
+        axes.set_title(title)
+        pylab.show()
+        FIGURE.savefig(filename)