From c35049bf6656fc97d1c9d947ebb8fdd9e0b34256 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Thu, 4 Nov 2010 14:53:50 -0400 Subject: [PATCH] Moved testing/common/fit.py to pysawsim/fit.py and cleaned up. --- pysawsim/constants.py | 28 ++++ pysawsim/fit.py | 375 ++++++++++++++++++++++++++++++++++++++++++ testing/common/fit.py | 89 ---------- 3 files changed, 403 insertions(+), 89 deletions(-) create mode 100644 pysawsim/constants.py create mode 100755 pysawsim/fit.py delete mode 100755 testing/common/fit.py diff --git a/pysawsim/constants.py b/pysawsim/constants.py new file mode 100644 index 0000000..fc00ce3 --- /dev/null +++ b/pysawsim/constants.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2008-2010 W. Trevor King +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# The author may be contacted at on the Internet, or +# write to Trevor King, Drexel University, Physics Dept., 3141 Chestnut St., +# Philadelphia PA 19104, USA. + +"""Define useful physical constants. + +TODO: bundle physcon? +""" + + +kB = 1.3806503e-23 # m²·kg/s²·K + diff --git a/pysawsim/fit.py b/pysawsim/fit.py new file mode 100755 index 0000000..2d35cf8 --- /dev/null +++ b/pysawsim/fit.py @@ -0,0 +1,375 @@ +# Copyright (C) 2008-2010 W. Trevor King +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# The author may be contacted at on the Internet, or +# write to Trevor King, Drexel University, Physics Dept., 3141 Chestnut St., +# Philadelphia PA 19104, USA. + +"""Provide :class:`ModelFitter` to make arbitrary model fitting easy. +""" + +from copy import deepcopy + +from numpy import arange, array, ndarray, exp, pi, sqrt +from scipy.optimize import leastsq + + +class ModelFitter (object): + """A convenient wrapper around :func:`scipy.optimize.leastsq`. + + For minimizing fit error: returns the normalized rms difference + between a dataset and a function `y(X)`. In order to use this + class, you must sublcass it and overwrite `ModelFitter.model()`. + It is recommended that you also overwrite + `ModelFitter.guess_initial_params()`, and then not pass + initializing params to `ModelFitter.model()`. This helps ensure + good automation for unsupervised testing. Developing good + heuristics for `ModelFitter.guess_initial_params()` is usually the + hardest part writing a theoretical histogram test. + + For bonus points, you can override `ModelFitter.print_model()` and + `ModelFitter.print_params()`, to give your users an easy to + understand idea of what's going on. It's better to be verbose and + clear. Remember, this is the testing code. It needs to be solid. + Save your wonderful, sneaky hacks for the main code ;). + + TODO: look into :mod:`scipy.odr` as an alternative fitting + algorithm (minimizing the sum of squares of orthogonal distances, + vs. minimizing y distances). + + Parameters + ---------- + d_data : array_like + Deflection data to be analyzed for the contact position. + info : + Store any extra information useful inside your overridden + methods. + rescale : boolean + Rescale parameters so the guess for each is 1.0. Also rescale + the data so data.std() == 1.0. + + Examples + -------- + >>> from pprint import pprint + >>> import numpy + >>> class LinearModel (ModelFitter): + ... '''Simple linear model. + ... + ... Levenberg-Marquardt is not how you want to solve this problem + ... for real systems, but it's a simple test case. + ... ''' + ... def model(self, params): + ... '''A linear model. + ... + ... Notes + ... ----- + ... .. math:: y = p_0 x + p_1 + ... ''' + ... p = params # convenient alias + ... self._model_data[:] = p[0]*arange(len(self._data)) + p[1] + ... return self._model_data + ... def guess_initial_params(self): + ... return [float(self._data[-1] - self._data[0])/len(self._data), + ... self._data[0]] + ... def guess_scale(self, params): + ... slope_scale = 0.1 + ... if params[1] == 0: + ... offset_scale = 1 + ... else: + ... offset_scale = 0.1*self._data.std()/abs(params[1]) + ... if offset_scale == 0: # data is completely flat + ... offset_scale = 1. + ... return [slope_scale, offset_scale] + ... def model_string(self): + ... return 'y = A x + B' + ... def param_string(self, params): + ... pstrings = [] + ... for name,param in zip(['A', 'B'], params): + ... pstrings.append('%s=%.3f' % (name, param)) + ... return ', '.join(pstrings) + >>> data = 20*numpy.sin(arange(1000)) + 7.*arange(1000) - 33.0 + >>> m = LinearModel(data) + >>> slope,offset = m.fit() + >>> pprint(m.fit_info) + ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE, +REPORT_UDIFF + {'active fitted parameters': array([ 6.999..., -32.889...]), + 'active parameters': array([ 6.999..., -32.889...]), + 'convergence flag': ..., + 'covariance matrix': array([[ 1.199...e-08, -5.993...e-06], + [ -5.993...e-06, 3.994...e-03]]), + 'data scale factor': 1.0, + 'fitted parameters': array([ 6.999..., -32.889...]), + 'info': {'fjac': array([[...]]), + 'fvec': array([...]), + 'ipvt': array([1, 2]), + 'nfev': 7, + 'qtf': array([...])}, + 'initial parameters': [6.992..., -33.0], + 'message': '...relative error between two consecutive iterates is at most 0.000...', + 'rescaled': False, + 'scale': [0.100..., 6.123...]} + + We round the outputs to protect the doctest against differences in + machine rounding during computation. We expect the values to be close + to the input settings (slope 7, offset -33). + + >>> print '%.3f' % slope + 7.000 + >>> print '%.3f' % offset + -32.890 + + >>> print m.model_string() + y = A x + B + >>> print m.param_string([slope,offset]) + A=7.000, B=-32.890 + >>> print ModelFitter.param_string(m, [slope,offset]) # doctest: +ELLIPSIS + param_0=6.9..., param_1=-32.8... + + The offset is a bit off because, the range is not a multiple of + :math:`2\pi`. + + We could also use rescaled parameters: + + >>> m = LinearModel(data, rescale=True) + >>> slope,offset = m.fit() + >>> print '%.3f' % slope + 7.000 + >>> print '%.3f' % offset + -32.890 + """ + def __init__(self, *args, **kwargs): + self.set_data(*args, **kwargs) + + def set_data(self, data, info=None, rescale=False): + self._data = data + self._model_data = ndarray(shape=data.shape, dtype=data.dtype) + self.info = info + self._rescale = rescale + if rescale == True: + for x in [data.std(), data.max()-data.min(), abs(data.max()), 1.0]: + if x != 0: + self._data_scale_factor = x + break + else: + self._data_scale_factor = 1.0 + + def model(self, params): + p = params # convenient alias + self._model_data[:] = arange(len(self._data)) + raise NotImplementedError + + def model_string(self): + """OVERRIDE: Give the user some idea of what model we're using. + """ + return 'model string not defined' + + def param_string(self, params): + """OVERRIDE: Give the user nicely named format for a parameter vector. + """ + pstrings = [] + for i,p in enumerate(params): + pstrings.append('param_%d=%g' % (i,p)) + return ', '.join(pstrings) + + def guess_initial_params(self): + """OVERRIDE: Provide some intelligent heuristic to pick a + starting point for minimizing your model""" + raise NotImplementedError + + def guess_scale(self, params): + """Guess the problem length scale in each parameter dimension. + + Notes + ----- + From the :func:`scipy.optimize.leastsq` documentation, `diag` + (which we refer to as `scale`, sets `factor * || diag * x||` + as the initial step. If `x == 0`, then `factor` is used + instead (from `lmdif.f`_):: + + do 70 j = 1, n + wa3(j) = diag(j)*x(j) + 70 continue + xnorm = enorm(n,wa3) + delta = factor*xnorm + if (delta .eq. zero) delta = factor + + For most situations then, you don't need to do anything fancy. + The default scaling (if you don't set a scale) is:: + + c on the first iteration and if mode is 1, scale according + c to the norms of the columns of the initial jacobian. + + (i.e. `diag(j) = acnorm(j)`, where `acnorm(j) is the norm of the `j`th column + of the initial Jacobian). + + .. _lmdif.f: http://www.netlib.org/minpack/lmdif.f + """ + return None + + def residual(self, params): + if self._rescale == True: + params = [p*s for p,s in zip(params, self._param_scale_factors)] + residual = self._data - self.model(params) + if False: # fit debugging + if not hasattr(self, '_i_'): + self._i_ = 0 + self._data.tofile('data.%d' % self._i_, sep='\n') + self._model_data.tofile('model.%d' % self._i_, sep='\n') + self._i_ += 1 + if self._rescale == True: + residual /= self._data_scale_factor + return residual + + def fit(self, initial_params=None, scale=None, **kwargs): + """ + Parameters + ---------- + initial_params : iterable or None + Initial parameter values for residual minimization. If + `None`, they are estimated from the data using + :meth:`guess_initial_params`. + scale : iterable or None + Parameter length scales for residual minimization. If + `None`, they are estimated from the data using + :meth:`guess_scale`. + kwargs : + Any additional arguments are passed through to `leastsq`. + """ + if initial_params == None: + initial_params = self.guess_initial_params() + if scale == None: + scale = self.guess_scale(initial_params) + if scale != None: + assert min(scale) > 0, scale + if self._rescale == True: + self._param_scale_factors = initial_params + for i,s in enumerate(self._param_scale_factors): + if s == 0: + self._param_scale_factors[i] = 1.0 + active_params = [p/s for p,s in zip(initial_params, + self._param_scale_factors)] + else: + active_params = initial_params + params,cov,info,mesg,ier = leastsq( + func=self.residual, x0=active_params, full_output=True, + diag=scale, **kwargs) + if self._rescale == True: + active_params = params + if len(initial_params) == 1: # params is a float + params = params * self._param_scale_factors[0] + else: + params = [p*s for p,s in zip(params, + self._param_scale_factors)] + else: + active_params = params + self.fit_info = { + 'rescaled': self._rescale, + 'initial parameters': initial_params, + 'active parameters': active_params, + 'scale': scale, + 'data scale factor': self._data_scale_factor, + 'active fitted parameters': active_params, + 'fitted parameters': params, + 'covariance matrix': cov, + 'info': info, + 'message': mesg, + 'convergence flag': ier, + } + return params + + +class HistogramModelFitter (ModelFitter): + """Adapt ModelFitter for easy Histogram() comparison. + + TODO: look into :func:`scipy.optimize.fmin_powell` as way to + minimize a single residul number. Useful for fitting `Histograms` + with their assorted residal methods. + + Examples + -------- + >>> from random import gauss + >>> from numpy import array + >>> from .histogram import Histogram + + >>> class GaussianModel (HistogramModelFitter): + ... '''Simple gaussian model. + ... ''' + ... def model(self, params): + ... '''A gaussian model. + ... + ... Notes + ... ----- + ... .. math:: y \\propto e^{-\\frac{(x-p_0)^2}{2 p_1^2}} + ... ''' + ... p = params # convenient alias + ... p[1] = abs(p[1]) # cannot have negative std. dev. + ... self._model_data.counts = self.info['N']/(p[1]*sqrt(2*pi)) * ( + ... exp(-((self._model_data.bin_centers - p[0])/p[1])**2 / 2)) + ... return self._model_data + ... def guess_initial_params(self): + ... return [self._data.mean, self._data.std_dev] + ... def guess_scale(self, params): + ... return [self._data.std_dev, self._data.std_dev] + ... def model_string(self): + ... return 'p(x) = A exp((x-mu)^2 / (2 sigma^2))' + ... def param_string(self, params): + ... pstrings = [] + ... for name,param in zip(['mu', 'sigma'], params): + ... pstrings.append('%s=%.3f' % (name, param)) + ... return ', '.join(pstrings) + + >>> mu = 1.45 + >>> sigma = 3.14 + >>> data = array([gauss(mu,sigma) for i in range(int(1e4))]) + >>> h = Histogram() + >>> h.from_data(data, h.calculate_bin_edges(data, 1)) + >>> m = GaussianModel(h) + >>> params = m.fit() + >>> print m.model_string() + p(x) = A exp((x-mu)^2 / (2 sigma^2)) + >>> print m.param_string(params) # doctest: +ELLIPSIS + mu=1.4..., sigma=3.1... + """ + def set_data(self, data, info=None, rescale=False): + self._data = data # Histogram() instance + self._model_data = deepcopy(data) + self._model_data.bin_edges = array(self._model_data.bin_edges) + self._model_data.counts = array(self._model_data.counts) + if info == None: + info = {} + self.info = info + self._rescale = rescale + self._data_scale_factor = 1.0 # bo need to rescale normalized hists. + self.info['N'] = self._data.total + # assume constant binwidth + self.info['binwidth'] = ( + self._data.bin_edges[1] - self._data.bin_edges[0]) + self._model_data.bin_centers = ( + self._model_data.bin_edges[:-1] + self.info['binwidth']/2) + + def residual(self, params): + if self._rescale == True: + params = [p*s for p,s in zip(params, self._param_scale_factors)] + residual = self._data.counts - self.model(params).counts + if True: # fit debugging + if not hasattr(self, '_i_'): + self._i_ = 0 + self._data.to_stream(open('hist-data.%d' % self._i_, 'w')) + self._model_data.headings = [str(x) for x in params] + self._model_data.to_stream(open('hist-model.%d' % self._i_, 'w')) + self._i_ += 1 + if self._rescale == True: + residual /= self._data_scale_factor + return residual diff --git a/testing/common/fit.py b/testing/common/fit.py deleted file mode 100755 index 1f0aa12..0000000 --- a/testing/common/fit.py +++ /dev/null @@ -1,89 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Fit histogram data coming in on stdin to a user-defined model. You -should write a python script sublcassing fit-hist.Model.""" - -import scipy.optimize, scipy -import sys - -kB=1.3806503e-23 # m²·kg/s²·K - -class Model (object) : - - """For minimizing fit error: returns the normalized rms difference - between a dataset and a function y(X). In order to use this - class, you must sublcass it and overwrite Model.model(x,params). - It is recommended that you also overwrite - Model.guess_initial_params(), and then not pass initializing - params to Model.model(). This helps ensure good automation for - unsupervised testing. Developing good heuristics for - Model.guessInitialParams() is usually the hardest part writing a - theoretical histogram test. - - For bonus points, you can override Model.printModel() and - Model.printParams(), to give your users an easy to understand idea - of what's going on. It's better to be verbose and clear. - Remember, this is the testing branch. It needs to be solid. Save - your wonderful, sneaky hacks for the main code ;). - - """ - def __init__(self, numParams, data) : - """Initialize the Model instance with a set of data, for - example: data = [(0,4),(1,3),(2,3),(4,0)]""" - self.data = data # an iterable filled with (x,y) pairs - # N (# unfolding events) = the sum of the y values (# hits per bin) - self.N = sum(zip(*data)[1]) # zip(* takes data to [[x0,x1,..],[y..]] - # assumes constant binwidth - self.binwidth = self.data[1][0] - self.data[0][0] - self.numParams = numParams - def nrms(self, params) : - """Returns the normalized root mean squagred between the data - and a model for a given set of parameters""" - s = 0.0 - for x,y in self.data : - yResidual = y-self.model(x, params) - s += yResidual**2 - s /= float(len(self.data)) - return s - def fit(self, x0=None) : - """Minimize error given an initial guess vector x0, the heart of - the class.""" - if x0 == None : - x0 = self.guessInitialParams() - xopt = scipy.optimize.fmin_powell(self.nrms, x0) - return xopt - def printModel(self) : - "OVERRIDE: Give the user some idea of what model we're fitting with" - print "model string not defined" - def printParams(self) : - "OVERRIDE: Give the user nicely named format for a parameter vector" - for i,p in zip(range(len(params)), params) : - print "param_%d\t=\t%g\n" % (i,p) - def model(self, x, params) : - "OVERRIDE: Implement your model" - assert len(params) == self.numParams - raise Exception, "model method not defined" - return 1.0 - def guessInitialParams(self) : - """OVERRIDE: Provide some intelligent heuristic to pick a - starting point for minimizing your model""" - return [1.0] * self.numParams - -def readData(fileObj) : - """Read histogram data from a whitespace-seperated, two-column - ASCII text-file into a list of tuples""" - data = [] - for line in fileObj : - if line[0] == "#" : continue - data.append([float(x) for x in line.split()]) - return data - -def run(numParams, modelClass) : - import sys - data = readData(sys.stdin) - mod = modelClass(numParams, data) - popt = mod.fit() - mod.printModel() - mod.printParams(popt) - print "N:\t%g" % mod.N - print "bin:\t%g" % mod.binwidth -- 2.26.2