Merge commit 'origin/master'
[sawsim.git] / pysawsim / histogram.py
index bb66c05f2bb0c3eebd74cb40b67ae1bafa21ebfb..7591e56ef56c02dc693ead4a6f441457b96e054d 100644 (file)
@@ -35,6 +35,9 @@ class Histogram (object):
 
     >>> h = Histogram()
     """
+    def __init__(self):
+        self.headings = None  # no headings until one is assigned
+
     def calculate_bin_edges(self, data, bin_width):
         """
         >>> h = Histogram()
@@ -59,17 +62,16 @@ class Histogram (object):
 
         All bins should be of equal width (so we can calculate which
         bin a data point belongs to).
-
-        `data` should be a numpy array.
         """
-        self.headings = None
-        self.bin_edges = bin_edges
+        data = numpy.array(data)
+        self.bin_edges = numpy.array(bin_edges)
         bin_width = self.bin_edges[1] - self.bin_edges[0]
 
         bin_is = numpy.floor((data - self.bin_edges[0])/bin_width)
-        self.counts = []
-        for i in range(len(self.bin_edges)-1):
-            self.counts.append(sum(bin_is == i).sum())
+        self.counts = numpy.zeros((len(self.bin_edges)-1,), dtype=numpy.int)
+        for i in range(len(self.counts)):
+            self.counts[i] = (bin_is == i).sum()
+        self.counts = numpy.array(self.counts)
         self.total = float(len(data)) # some data might be outside the bins
         self.mean = data.mean()
         self.std_dev = data.std()
@@ -125,18 +127,7 @@ class Histogram (object):
             self.counts.append(float(count))
         bin_width = self.bin_edges[1] - self.bin_edges[0]
         self.bin_edges.append(self.bin_edges[-1]+bin_width)
-        self.total = float(sum(self.counts))
-        self.mean = 0
-        for bin,count in zip(self.bin_edges, self.counts):
-            bin += bin_width / 2.0
-            self.mean += bin * count
-        self.mean /=  float(self.total)
-        variance = 0
-        for bin,count in zip(self.bin_edges, self.counts):
-            bin += bin_width / 2.0
-            variance += (bin - self.mean)**2 * count
-        self.std_dev = numpy.sqrt(variance)
-        self.normalize()
+        self.analyze()
 
     def to_stream(self, stream):
         """Write to `stream` with the same format as `from_stream()`.
@@ -158,7 +149,29 @@ class Histogram (object):
         for bin,count in zip(self.bin_edges, self.counts):
             stream.write('%g\t%g\n' % (bin, count))
 
+    def analyze(self):
+        """Analyze `.counts` and `.bin_edges` if the raw data is missing.
+
+        Generate `.total`, `.mean`, and `.std_dev` by treating every
+        count as sitting at its bin center (left edge plus half the bin
+        width), then run `.normalize()`.
+        """
+        bin_width = self.bin_edges[1] - self.bin_edges[0]
+        self.total = float(sum(self.counts))
+        centers = [edge + bin_width / 2.0 for edge in self.bin_edges]
+        # zip() stops at the shorter sequence, so a trailing right-hand
+        # edge without a matching count is ignored.
+        pairs = list(zip(centers, self.counts))
+        self.mean = sum(c * n for c,n in pairs) / self.total
+        # Weighted variance: the summed squares must be divided by the
+        # total count before taking the square root.
+        variance = sum((c - self.mean)**2 * n for c,n in pairs) / self.total
+        self.std_dev = numpy.sqrt(variance)
+        self.normalize()
+
     def normalize(self):
+        """Generate `.probabilities` (each count divided by `.total`)
+        from `.total` and `.counts`; `.total` is first cast to float."""
         self.total = float(self.total)
         self.probabilities = [count/self.total for count in self.counts]
 
@@ -169,14 +182,14 @@ class Histogram (object):
         return abs(other.std_dev - self.std_dev)
 
     def chi_squared_residual(self, other):
-        assert self.bin_edges == other.bin_edges
+        assert (self.bin_edges == other.bin_edges).all()
         residual = 0
         for probA,probB in zip(self.probabilities, other.probabilities):
             residual += (probA-probB)**2 / probB
         return residual
 
     def jensen_shannon_residual(self, other):
-        assert self.bin_edges == other.bin_edges
+        assert (self.bin_edges == other.bin_edges).all()
         def d_KL_term(p,q):
             """
             Kullback-Leibler divergence for a single bin, with the