Relax precision in pysawsim.histogram.Histogram bin_edges doctest.

[sawsim.git] / pysawsim / histogram.py
diff --git a/pysawsim/histogram.py b/pysawsim/histogram.py

index 880fd409ec4f8c1b487cf0b8eba2487f82cc6b8c..acd135955e1223df0a0b0f591ceb8849cee54230 100644 (file)
--- a/pysawsim/histogram.py
+++ b/pysawsim/histogram.py
@@ -22,12 +22,22 @@
  
  import numpy
  
+from . import log
+
+
+_multiprocess_can_split_ = True
+"""Allow nosetests to split tests between processes.
+"""
+
  
  class Histogram (object):
      """A histogram with a flexible comparison method, `residual()`.
  
      >>> h = Histogram()
      """
+    def __init__(self):
+        self.headings = None
+
      def calculate_bin_edges(self, data, bin_width):
          """
          >>> h = Histogram()
@@ -52,16 +62,16 @@ class Histogram (object):
  
          All bins should be of equal width (so we can calculate which
          bin a data point belongs to).
-
-        `data` should be a numpy array.
          """
-        self.bin_edges = bin_edges
+        data = numpy.array(data)
+        self.bin_edges = numpy.array(bin_edges)
          bin_width = self.bin_edges[1] - self.bin_edges[0]
  
          bin_is = numpy.floor((data - self.bin_edges[0])/bin_width)
-        self.counts = []
-        for i in range(len(self.bin_edges)-1):
-            self.counts.append(sum(bin_is == i))
+        self.counts = numpy.zeros((len(self.bin_edges)-1,), dtype=numpy.int)
+        for i in range(len(self.counts)):
+            self.counts[i] = (bin_is == i).sum()
+        self.counts = numpy.array(self.counts)
          self.total = float(len(data)) # some data might be outside the bins
          self.mean = data.mean()
          self.std_dev = data.std()
@@ -76,40 +86,76 @@ class Histogram (object):
              <bin_edge><whitespace><count>
              ...
  
-        For example:
-
-
          `<bin_edge>` should mark the left-hand side of the bin, and
          all bins should be of equal width (so we know where the last
          one ends).
  
          >>> import StringIO
          >>> h = Histogram()
-        >>> h.from_stream(StringIO.StringIO('''# Force (N)\tUnfolding events
-        ... 150e-12\t10
-        ... 200e-12\t40
-        ... 250e-12\t5
+        >>> h.from_stream(StringIO.StringIO('''# Force (N)\\tUnfolding events
+        ... 150e-12\\t10
+        ... 200e-12\\t40
+        ... 250e-12\\t5
          ... '''))
+        >>> h.headings
+        ['Force (N)', 'Unfolding events']
          >>> h.total
          55.0
          >>> h.counts
          [10.0, 40.0, 5.0]
          >>> h.bin_edges  # doctest: +ELLIPSIS
-        [1.5e-10, 2.000...e-10, 2.500...e-10, 3e-10]
+        [1.5e-10, 2...e-10, 2.5...e-10, 3e-10]
          >>> h.probabilities  # doctest: +ELLIPSIS
          [0.181..., 0.727..., 0.0909...]
          """
+        self.headings = None
          self.bin_edges = []
          self.counts = []
          for line in stream.readlines():
              line = line.strip()
-            if len(line) == 0 or line[0] == "#":
+            if len(line) == 0 or line.startswith('#'):
+                if self.headings == None and line.startswith('#'):
+                    line = line[len('#'):]
+                    self.headings = [x.strip() for x in line.split('\t')]
                  continue # ignore blank lines and comments
-            bin_edge,count = line.split()
+            try:
+                bin_edge,count = line.split()
+            except ValueError:
+                log().error('Unable to parse histogram line: "%s"' % line)
+                raise
              self.bin_edges.append(float(bin_edge))
              self.counts.append(float(count))
          bin_width = self.bin_edges[1] - self.bin_edges[0]
          self.bin_edges.append(self.bin_edges[-1]+bin_width)
+        self.analyze()
+
+    def to_stream(self, stream):
+        """Write to `stream` with the same format as `from_stream()`.
+
+        >>> import sys
+        >>> h = Histogram()
+        >>> h.headings = ['Force (N)', 'Unfolding events']
+        >>> h.bin_edges = [1.5e-10, 2e-10, 2.5e-10, 3e-10]
+        >>> h.counts = [10, 40, 5]
+        >>> h.to_stream(sys.stdout)
+        ... # doctest: +NORMALIZE_WHITESPACE, +REPORT_UDIFF
+        #Force (N)\tUnfolding events
+        1.5e-10\t10
+        2e-10\t40
+        2.5e-10\t5
+        """
+        if self.headings != None:
+            stream.write('#%s\n' % '\t'.join(self.headings))
+        for bin,count in zip(self.bin_edges, self.counts):
+            stream.write('%g\t%g\n' % (bin, count))
+
+    def analyze(self):
+        """Analyze `.counts` and `.bin_edges` if the raw data is missing.
+
+        Generate `.total`, `.mean`, and `.std_dev`, and run
+        `.normalize()`.
+        """
+        bin_width = self.bin_edges[1] - self.bin_edges[0]
          self.total = float(sum(self.counts))
          self.mean = 0
          for bin,count in zip(self.bin_edges, self.counts):
@@ -124,6 +170,8 @@ class Histogram (object):
          self.normalize()
  
      def normalize(self):
+        """Generate `.probabilities` from `.total` and `.counts`.
+        """
          self.total = float(self.total)
          self.probabilities = [count/self.total for count in self.counts]
  
@@ -134,14 +182,14 @@ class Histogram (object):
          return abs(other.std_dev - self.std_dev)
  
      def chi_squared_residual(self, other):
-        assert self.bin_edges == other.bin_edges
+        assert (self.bin_edges == other.bin_edges).all()
          residual = 0
          for probA,probB in zip(self.probabilities, other.probabilities):
              residual += (probA-probB)**2 / probB
          return residual
  
      def jensen_shannon_residual(self, other):
-        assert self.bin_edges == other.bin_edges
+        assert (self.bin_edges == other.bin_edges).all()
          def d_KL_term(p,q):
              """
              Kullback-Leibler divergence for a single bin, with the