Relax precision in pysawsim.histogram.Histogram bin_edges doctest.
[sawsim.git] / pysawsim / histogram.py
index 880fd409ec4f8c1b487cf0b8eba2487f82cc6b8c..acd135955e1223df0a0b0f591ceb8849cee54230 100644 (file)
 
 import numpy
 
+from . import log
+
+
+_multiprocess_can_split_ = True
+"""Allow nosetests to split tests between processes.
+"""
+
 
 class Histogram (object):
     """A histogram with a flexible comparison method, `residual()`.
 
     >>> h = Histogram()
     """
+    def __init__(self):
+        self.headings = None
+
     def calculate_bin_edges(self, data, bin_width):
         """
         >>> h = Histogram()
@@ -52,16 +62,16 @@ class Histogram (object):
 
         All bins should be of equal width (so we can calculate which
         bin a data point belongs to).
-
-        `data` should be a numpy array.
         """
-        self.bin_edges = bin_edges
+        data = numpy.array(data)
+        self.bin_edges = numpy.array(bin_edges)
         bin_width = self.bin_edges[1] - self.bin_edges[0]
 
         bin_is = numpy.floor((data - self.bin_edges[0])/bin_width)
-        self.counts = []
-        for i in range(len(self.bin_edges)-1):
-            self.counts.append(sum(bin_is == i))
+        self.counts = numpy.zeros((len(self.bin_edges)-1,), dtype=numpy.int)
+        for i in range(len(self.counts)):
+            self.counts[i] = (bin_is == i).sum()
+        self.counts = numpy.array(self.counts)
         self.total = float(len(data)) # some data might be outside the bins
         self.mean = data.mean()
         self.std_dev = data.std()
@@ -76,40 +86,76 @@ class Histogram (object):
             <bin_edge><whitespace><count>
             ...
 
-        For example:
-
-
         `<bin_edge>` should mark the left-hand side of the bin, and
         all bins should be of equal width (so we know where the last
         one ends).
 
         >>> import StringIO
         >>> h = Histogram()
-        >>> h.from_stream(StringIO.StringIO('''# Force (N)\tUnfolding events
-        ... 150e-12\t10
-        ... 200e-12\t40
-        ... 250e-12\t5
+        >>> h.from_stream(StringIO.StringIO('''# Force (N)\\tUnfolding events
+        ... 150e-12\\t10
+        ... 200e-12\\t40
+        ... 250e-12\\t5
         ... '''))
+        >>> h.headings
+        ['Force (N)', 'Unfolding events']
         >>> h.total
         55.0
         >>> h.counts
         [10.0, 40.0, 5.0]
         >>> h.bin_edges  # doctest: +ELLIPSIS
-        [1.5e-10, 2.000...e-10, 2.500...e-10, 3e-10]
+        [1.5e-10, 2...e-10, 2.5...e-10, 3e-10]
         >>> h.probabilities  # doctest: +ELLIPSIS
         [0.181..., 0.727..., 0.0909...]
         """
+        self.headings = None
         self.bin_edges = []
         self.counts = []
         for line in stream.readlines():
             line = line.strip()
-            if len(line) == 0 or line[0] == "#":
+            if len(line) == 0 or line.startswith('#'):
+                if self.headings == None and line.startswith('#'):
+                    line = line[len('#'):]
+                    self.headings = [x.strip() for x in line.split('\t')]
                 continue # ignore blank lines and comments
-            bin_edge,count = line.split()
+            try:
+                bin_edge,count = line.split()
+            except ValueError:
+                log().error('Unable to parse histogram line: "%s"' % line)
+                raise
             self.bin_edges.append(float(bin_edge))
             self.counts.append(float(count))
         bin_width = self.bin_edges[1] - self.bin_edges[0]
         self.bin_edges.append(self.bin_edges[-1]+bin_width)
+        self.analyze()
+
+    def to_stream(self, stream):
+        """Write to `stream` with the same format as `from_stream()`.
+
+        >>> import sys
+        >>> h = Histogram()
+        >>> h.headings = ['Force (N)', 'Unfolding events']
+        >>> h.bin_edges = [1.5e-10, 2e-10, 2.5e-10, 3e-10]
+        >>> h.counts = [10, 40, 5]
+        >>> h.to_stream(sys.stdout)
+        ... # doctest: +NORMALIZE_WHITESPACE, +REPORT_UDIFF
+        #Force (N)\tUnfolding events
+        1.5e-10\t10
+        2e-10\t40
+        2.5e-10\t5
+        """
+        if self.headings != None:
+            stream.write('#%s\n' % '\t'.join(self.headings))
+        for bin,count in zip(self.bin_edges, self.counts):
+            stream.write('%g\t%g\n' % (bin, count))
+
+    def analyze(self):
+        """Analyze `.counts` and `.bin_edges` if the raw data is missing.
+
+        Generate `.total`, `.mean`, and `.std_dev`, and run
+        `.normalize()`.
+        """
+        bin_width = self.bin_edges[1] - self.bin_edges[0]
         self.total = float(sum(self.counts))
         self.mean = 0
         for bin,count in zip(self.bin_edges, self.counts):
@@ -124,6 +170,8 @@ class Histogram (object):
         self.normalize()
 
     def normalize(self):
+        """Generate `.probabilities` from `.total` and `.counts`.
+        """
         self.total = float(self.total)
         self.probabilities = [count/self.total for count in self.counts]
 
@@ -134,14 +182,14 @@ class Histogram (object):
         return abs(other.std_dev - self.std_dev)
 
     def chi_squared_residual(self, other):
-        assert self.bin_edges == other.bin_edges
+        assert (self.bin_edges == other.bin_edges).all()
         residual = 0
         for probA,probB in zip(self.probabilities, other.probabilities):
             residual += (probA-probB)**2 / probB
         return residual
 
     def jensen_shannon_residual(self, other):
-        assert self.bin_edges == other.bin_edges
+        assert (self.bin_edges == other.bin_edges).all()
         def d_KL_term(p,q):
             """
             Kullback-Leibler divergence for a single bin, with the