From: W. Trevor King
Date: Mon, 18 Mar 2013 01:32:44 +0000 (-0400)
Subject: pca.py: Add principal component analysis script
X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=8b96b50d071c70c21ab4006be9ea77336f65fd8b;p=blog.git

pca.py: Add principal component analysis script

A good deal of this script is a huge docstring full of tests comparing
my analysis with other examples I found online.  You can run the
doctests with:

  $ python -m doctest pca.py

References:

Hervé Abdi. Factor Rotations in Factor Analyses. In: Lewis-Beck M.,
Bryman, A., Futing T. (Eds.) (2003). Encyclopedia of Social Sciences
Research Methods. Thousand Oaks (CA): Sage.

Trevor Park. A Note on Terse Coding of Kaiser's Varimax Rotation Using
Complex Number Representation. (March 2003).

Andrew Wiesner's factor analysis notes.

Daniel Denis's factor analysis notes. (2009).

Archived version of the University of Newcastle upon Tyne's "How to
Perform and Interpret Factor Analysis using SPSS". (November 2005).
---
diff --git a/posts/Factor_analysis/pca.py b/posts/Factor_analysis/pca.py
new file mode 100755
index 0000000..03d1639
--- /dev/null
+++ b/posts/Factor_analysis/pca.py
@@ -0,0 +1,607 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2013 W. Trevor King
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program.  If not, see
+# <http://www.gnu.org/licenses/>.
+
+"""Understanding principal component analysis and varimax rotation
+
+Based on Hervé Abdi's example [1] and Trevor Park's implementation
+notes [2] for varimax rotation.
+
+>>> import numpy
+>>> scores = numpy.array(  # five wines scored with seven tests
+...     [[14, 7, 8, 7, 7, 13, 7],
+...      [10, 7, 6, 4, 3, 14, 7],
+...      [ 8, 5, 6, 10, 5, 12, 5],
+...      [ 2, 4, 7, 16, 7, 11, 3],
+...      [ 6, 2, 4, 13, 3, 10, 3]])
+>>> A = numpy.array(  # original loadings (after PCA)
+...     [[-0.3965,  0.1149],   # hedonic
+...      [-0.4454, -0.1090],   # for meat
+...      [-0.2646, -0.5854],   # for dessert
+...      [ 0.4160, -0.3111],   # price
+...      [-0.0485, -0.7245],   # sugar
+...      [-0.4385,  0.0555],   # alcohol
+...      [-0.4547,  0.0865]])  # acidity
+>>> print(varimax_criterion(A).round(5))
+0.02112
+
+>>> Ar = varimax(A)
+>>> Ar.round(4)
+array([[-0.4117,  0.03  ],
+       [-0.413 , -0.1991],
+       [-0.1372, -0.6276],
+       [ 0.4716, -0.2179],
+       [ 0.1031, -0.7188],
+       [-0.4405, -0.0368],
+       [-0.4627, -0.0098]])
+>>> print(varimax_criterion(Ar).round(5))
+0.02367
+
+This is slightly better than Abdi's rotation (his table 3):
+
+>>> Az = numpy.array(  # Abdi's rotated loadings (his table 3)
+...     [[-0.4125,  0.0153],   # hedonic
+...      [-0.4057, -0.2138],   # for meat
+...      [-0.1147, -0.6321],   # for dessert
+...      [ 0.4790, -0.2010],   # price
+...      [ 0.1286, -0.7146],   # sugar
+...      [-0.4389, -0.0525],   # alcohol
+...      [-0.4620, -0.0264]])  # acidity
+>>> print(varimax_criterion(Az).round(5))
+0.02359
+
+There may be some copy/paste or rounding errors in Abdi's table 3,
+because he claims a rotation of -15 degrees in his text (which matches
+my calculated value), but his table 3 has only been rotated by -14
+degrees.
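+
+For a two-factor solution, each attribute's loadings are a point in
+the plane, so the rotation that varimax applied can be read off as the
+change in each row's polar angle.  A rough sketch of the idea (not a
+doctest; `a` and `ar` stand for corresponding rows of the unrotated
+and rotated loading matrices):
+
+    theta = numpy.arctan2(a[1], a[0]) - numpy.arctan2(ar[1], ar[0])
+
+The loop below applies this row by row to compare my rotation angle
+with Abdi's: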
+
+>>> for i,(a,ar,az) in enumerate(zip(A, Ar, Az)):
+...     theta_a = numpy.arctan2(a[1], a[0])
+...     theta_r = numpy.arctan2(ar[1], ar[0])
+...     theta_z = numpy.arctan2(az[1], az[0])
+...     # (theta_init - theta_final) so we can print *axis* rotation
+...     dr = ((theta_a - theta_r) * 180 / numpy.pi) % 360 - 360
+...     dz = ((theta_a - theta_z) * 180 / numpy.pi) % 360 - 360
+...     print("{}: {:.2f} (vs Abdi's {:.2f})".format(i, dr, dz))
+0: -11.99 (vs Abdi's -14.04)
+1: -11.99 (vs Abdi's -14.04)
+2: -11.99 (vs Abdi's -14.04)
+3: -11.99 (vs Abdi's -14.03)
+4: -11.99 (vs Abdi's -14.03)
+5: -11.99 (vs Abdi's -14.03)
+6: -11.99 (vs Abdi's -14.04)
+
+These loading matrices all have the variance stripped out:
+
+>>> print_row(eigenvalues(A), precision=3)
+ 1.000  1.000
+>>> print_row(eigenvalues(Ar), precision=3)
+ 1.000  1.000
+>>> print_row(eigenvalues(Az), precision=3)
+ 1.000  1.000
+
+As an additional test, let's try to reproduce the factor loadings from
+the raw scores:
+
+>>> evals,Ai = pca(scores, output_dim=2)
+>>> print_row(evals, precision=3)
+ 4.799  1.846  0.352  0.003  0.000
+>>> varimax(Ai).round(4)
+array([[-0.4113, -0.0306],
+       [-0.4111,  0.1994],
+       [-0.1435,  0.6274],
+       [ 0.4723,  0.218 ],
+       [ 0.1076,  0.7188],
+       [-0.4396,  0.0372],
+       [-0.4619,  0.0098]])
+
+This is quite similar (modulo column sign) to our earlier rotation of
+Abdi's PCA results, and the eigenvalues are close to his (4.7627,
+1.8101, 0.3527, and 0.0744).
+
+>>> Ar.round(4)
+array([[-0.4117,  0.03  ],
+       [-0.413 , -0.1991],
+       [-0.1372, -0.6276],
+       [ 0.4716, -0.2179],
+       [ 0.1031, -0.7188],
+       [-0.4405, -0.0368],
+       [-0.4627, -0.0098]])
+
+The fraction of the variance explained by the first two factors is:
+
+>>> print(format_number(explained_variance(evals)[:2].sum()))
+ 0.95
+
+which is reasonably close to Abdi's claimed 94%.  The communalities
+will be:
+
+>>> print_row(communalities(Ai))
+ 0.17  0.21  0.41  0.27  0.53  0.19  0.21
+
+These loading matrices all have the variance stripped out:
+
+>>> print_row(eigenvalues(Ai), precision=3)
+ 1.000  1.000
+>>> print_row(eigenvalues(Ar), precision=3)
+ 1.000  1.000
+
+Checks against Andrew Wiesner
+-----------------------------
+
+Checking against the example loadings given by Andrew Wiesner [3]:
+
+>>> A = numpy.array(
+...     [[0.286, 0.076, 0.841],
+...      [0.698, 0.153, 0.084],
+...      [0.744, -0.410, -0.020],
+...      [0.471, 0.522, 0.135],
+...      [0.681, -0.156, -0.148],
+...      [0.498, -0.498, -0.253],
+...      [0.861, -0.115, 0.011],
+...      [0.642, 0.322, 0.044],
+...      [0.298, 0.595, -0.533]])
+
+Wiesner also gives the eigenvalues [3],
+
+>>> evals = numpy.array(
+...     [3.2978,
+...      1.2136,
+...      1.1055,
+...      0.9073,
+...      0.8606,
+...      0.5622,
+...      0.4838,
+...      0.3181,
+...      0.2511])
+
+communalities [4],
+
+>>> com_z = numpy.array(
+...     [0.79500707,
+...      0.51783185,
+...      0.72230182,
+...      0.51244913,
+...      0.50977159,
+...      0.56073895,
+...      0.75382091,
+...      0.51725940,
+...      0.72770402])
+
+and loadings [4]:
+
+>>> Az = numpy.array(
+...     [[ 0.021, 0.239, 0.859],
+...      [ 0.483, 0.547, 0.166],
+...      [ 0.829, 0.127, 0.137],
+...      [ 0.031, 0.702, 0.139],
+...      [ 0.652, 0.289, -0.028],
+...      [ 0.734, -0.094, -0.117],
+...      [ 0.738, 0.432, 0.150],
+...      [ 0.301, 0.656, 0.099],
+...      [-0.022, 0.651, -0.551]])
+
+Basic stuff matches almost exactly with our calculations [3]:
+
+>>> print_row(explained_variance(evals), precision=3)
+ 0.366  0.135  0.123  0.101  0.096  0.062  0.054  0.035  0.028
+
+(Wiesner has 0.3664, 0.1348, 0.1228, 0.1008, 0.0956, 0.0625, 0.0538,
+0.0353, and 0.0279 [3])
+
+>>> print(format_number(explained_variance(evals)[:3].sum()))
+... # fraction of total variance explained by the first 3 common factors
+ 0.62
+
+(Wiesner has 62% [3])
+
+>>> print_row(communalities(A), precision=3)
+ 0.795  0.518  0.722  0.513  0.510  0.560  0.755  0.518  0.727
+>>> print_row(com_z, precision=3)
+ 0.795  0.518  0.722  0.512  0.510  0.561  0.754  0.517  0.728
+
+To get the rotated loadings to match, we need to scale by the
+eigenvalues:
+
+>>> Ar = varimax(A)
+>>> Ar.round(3)
+array([[ 0.078, -0.06 ,  0.886],
+       [ 0.497,  0.423,  0.303],
+       [ 0.842,  0.006,  0.112],
+       [ 0.107,  0.611,  0.357],
+       [ 0.677,  0.228,  0.022],
+       [ 0.715, -0.11 , -0.193],
+       [ 0.784,  0.296,  0.229],
+       [ 0.368,  0.55 ,  0.281],
+       [ 0.022,  0.795, -0.307]])
+
+but they're in the same ballpark, and we have a higher varimax
+criterion:
+
+>>> print(format_number(varimax_criterion(Ar), precision=4))
+ 0.0591
+>>> print(format_number(varimax_criterion(Az), precision=4))
+ 0.0552
+
+These loading matrices all have the variance still included:
+
+>>> print_row(evals, precision=3)
+ 3.298  1.214  1.105  0.907  0.861  0.562  0.484  0.318  0.251
+>>> print_row(eigenvalues(A), precision=3)
+ 3.298  1.213  1.105
+>>> print_row(eigenvalues(Ar), precision=3)
+ 2.693  1.643  1.280
+>>> print_row(eigenvalues(Az), precision=3)
+ 2.522  1.998  1.154
+
+Checks against Daniel Denis
+---------------------------
+
+Checking against the example PCA given by Daniel Denis [6]:
+
+>>> scores = numpy.array(
+...     [[32, 64, 65, 67],
+...      [61, 37, 62, 65],
+...      [59, 40, 45, 43],
+...      [36, 62, 34, 35],
+...      [62, 46, 43, 40]])
+>>> evals,Ai = pca(scores, output_dim=2)
+>>> print_row(eigenvalues(Ai), precision=3)
+ 1.000  1.000
+>>> Ai *= numpy.sqrt(evals[:2])  # insert the variance
+>>> print_row(eigenvalues(Ai), precision=3)
+ 2.016  1.942
+>>> print_row(evals, precision=3)
+ 2.016  1.942  0.038  0.004
+
+(Denis has the exact same values on page 4)
+
+>>> print_row(explained_variance(evals))
+ 0.50  0.49  0.01  0.00
+
+(Denis has 50.408%, 48.538%, 0.945%, and 0.109%)
+>>> Ar = varimax(Ai)
+>>> Ar.round(4)
+array([[ 0.0872, -0.9877],
+       [ 0.0723,  0.9886],
+       [-0.9973, -0.0258],
+       [-0.9976,  0.0401]])
+
+This matches up well (modulo column sign) with Denis' loadings:
+
+>>> A = numpy.array(
+...     [[-0.500,  0.856],
+...      [ 0.357, -0.925],
+...      [ 0.891,  0.449],
+...      [ 0.919,  0.390]])
+>>> Az = varimax(A)
+>>> Az.round(4)
+array([[-0.0867,  0.9875],
+       [-0.0721, -0.9889],
+       [ 0.9974,  0.0256],
+       [ 0.9975, -0.0397]])
+
+After rotation, both sets of loadings are about equally tightly
+aligned, as you can see from the varimax criterion:
+
+>>> print(format_number(varimax_criterion(Ar), precision=4))
+ 0.2411
+>>> print(format_number(varimax_criterion(Az), precision=4))
+ 0.2411
+
+These loading matrices all have the variance still included (because
+we added it to our PCA loadings by hand):
+
+>>> print_row(eigenvalues(A), precision=3)
+ 2.016  1.942
+>>> print_row(eigenvalues(Az), precision=3)
+ 2.003  1.955
+>>> print_row(eigenvalues(Ai), precision=3)
+ 2.016  1.942
+>>> print_row(eigenvalues(Ar), precision=3)
+ 2.003  1.955
+
+Denis also has a separate varimax example (page 7):
+
+>>> A = numpy.array(
+...     [[-0.400,  0.900],
+...      [ 0.251, -0.947],
+...      [ 0.932,  0.348],
+...      [ 0.956,  0.286]])
+
+which he rotates into:
+
+>>> Az = numpy.array(
+...     [[-0.086, -0.981],
+...      [-0.071,  0.977],
+...      [ 0.994, -0.026],
+...      [ 0.997,  0.040]])
+
+This matches up well (modulo column sign) with ours:
+
+>>> varimax(A).round(3)
+array([[-0.085,  0.981],
+       [-0.071, -0.977],
+       [ 0.995,  0.026],
+       [ 0.997, -0.041]])
+
+Checks against SPSS via Newcastle upon Tyne
+-------------------------------------------
+
+Putting everything together for some sample data from an old Newcastle
+page [7]:
+
+>>> scores = numpy.array(
+...     [[1, 3, 4, 6, 7, 2, 4, 5],
+...      [2, 3, 4, 3, 4, 6, 7, 6],
+...      [4, 5, 6, 7, 7, 2, 3, 4],
+...      [3, 4, 5, 6, 7, 3, 5, 4],
+...      [2, 5, 5, 5, 6, 2, 4, 5],
+...      [3, 4, 6, 7, 7, 4, 3, 5],
+...      [2, 3, 6, 4, 5, 4, 4, 4],
+...      [1, 3, 4, 5, 6, 3, 3, 4],
+...      [3, 3, 5, 6, 6, 4, 4, 3],
+...      [4, 4, 5, 6, 7, 4, 3, 4],
+...      [2, 3, 6, 7, 5, 4, 4, 4],
+...      [2, 3, 5, 7, 6, 3, 3, 3]])
+>>> analyze(scores, output_dim=3, unitary=False, delimiter=' ', missing=' '*6)
+#eig prop r-eig r-prop com f-0 f-1 f-2 mean std
+ 3.709  0.464  2.456  0.307  0.844 -0.071 -0.122  0.908  2.417  1.101
+ 1.478  0.185  2.072  0.259  0.918 -0.603  0.310  0.677  3.583  0.877
+ 1.361  0.170  2.020  0.252  0.735  0.075 -0.429  0.738  5.083  0.877
+ 0.600  0.075                0.768 -0.485 -0.659  0.313  5.750  1.424
+ 0.417  0.052                0.814 -0.837 -0.219  0.256  6.083  1.101
+ 0.281  0.035                0.865  0.909  0.170  0.102  3.417  1.287
+ 0.129  0.016                0.800  0.564  0.682 -0.131  3.917  1.287
+ 0.026  0.003                0.804  0.051  0.895 -0.032  4.250  0.957
+
+Which matches the values given on the Newcastle page (where I've
+unshuffled their rows to match the initial column ordering used for
+the scores):
+
+  3.709  0.464  2.501  0.313  0.844  0.908  2.42  1.00
+  1.478  0.185  2.045  0.256  0.918  0.637  0.634  3.58  0.79
+  1.361  0.170  2.002  0.250  0.735  0.760  5.08  0.79
+  0.600  0.075  0.768  -0.652  5.75  1.29
+  0.417  0.052  0.814  0.845  6.08  1.00
+  0.281  0.035  0.865  -0.901  3.42  1.16
+  0.129  0.016  0.800  -0.560  0.684  3.92  1.16
+  0.026  0.003  0.804  0.893  4.25  0.87
+
+Their standard deviations differ from mine because they used a delta
+degrees of freedom of one, while I used the number of factors:
+
+>>> print_row(scores.std(axis=0, ddof=1))
+ 1.00  0.79  0.79  1.29  1.00  1.16  1.16  0.87
+
+[1]: http://www.utd.edu/~herve/Abdi-rotations-pretty.pdf
+[2]: http://www.stat.ufl.edu/~tpark/Research/about_varimax.html
+[3]: http://sites.stat.psu.edu/~ajw13/stat505/fa06/17_factor/06_factor_example.html
+[4]: http://sites.stat.psu.edu/~ajw13/stat505/fa06/17_factor/07_factor_commun.html
+[5]: http://sites.stat.psu.edu/~ajw13/stat505/fa06/17_factor/13_factor_varimax.html
+[6]: http://psychweb.psy.umt.edu/denis/datadecision/factor/dd_fa_part_2_aug_2009.pdf
+[7]: http://web.archive.org/web/20051125011642/http://www.ncl.ac.uk/iss/statistics/docs/factoranalysis.html
+"""
+
+import itertools as _itertools
+
+import numpy as _numpy
+
+
+def format_number(x, precision=2):
+    if isinstance(x, str):
+        return x
+    elif _numpy.isnan(x) or isinstance(x, int):
+        return str(x)
+    format_string = '{{: .{}f}}'.format(precision)
+    return format_string.format(x)
+
+
+def print_row(row, delimiter='  ', precision=2):
+    print(delimiter.join(format_number(x, precision=precision) for x in row))
+
+
+def print_matrix(matrix, delimiter='  ', precision=2):
+    print('\n'.join(delimiter.join(format_number(x, precision=precision)
+                                   for x in row)
+                    for row in matrix))
+
+
+def varimax_criterion(A):
+    """Return the varimax criterion of a factor-loading matrix A
+
+    V(A) = 1/k ∑ⱼ (1/d ∑ᵢ Aᵢⱼ⁴ - (1/d ∑ᵢ Aᵢⱼ²)²)
+         = 1/k ∑ⱼ (⟨Aᵢⱼ⁴⟩ᵢ - ⟨Aᵢⱼ²⟩ᵢ²)
+         = var(A²)
+    """
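+    # numpy.var() flattens its argument by default, so this is the
+    # variance of the squared loadings taken over every entry of A.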
+    return _numpy.var(A**2)
+
+
+def pca(scores, ddof=0, output_dim=None):
+    """Principal component analysis via singular value decomposition
+
+    Return an array of eigenvalues (variance scaling) and an
+    orthogonal matrix of unitary eigenvectors (the factor loadings).
+    """
+    X = scores
+    d,k = X.shape
+    X = (X - X.mean(axis=0)) / X.std(axis=0, ddof=0) / _numpy.sqrt(d-ddof)
+    U,s,V = _numpy.linalg.svd(X, full_matrices=False)
+    S = _numpy.diag(s)
+    G = U.dot(S)  # principal component scores (not used below)
+    ss = _numpy.sort(s)[::-1]
+    assert (s == ss).all(), (s, ss, s-ss)
+    if output_dim is None:
+        output_dim = V.T.shape[1]
+    eigenvalues = s**2  # see explained_variance.__doc__
+    return (eigenvalues, V.T[:,:output_dim])
+
+
+def communalities(A):
+    """Communalities for each measured attribute
+
+    This is how well the model fits each attribute.  It's similar to
+    the coefficient of determination (R²) for linear regressions.
+
+    http://sites.stat.psu.edu/~ajw13/stat505/fa06/17_factor/07_factor_commun.html
+    """
+    return (A**2).sum(axis=1).T
+
+
+def eigenvalues(A):
+    """Extract the measured attribute variance from the loading matrix
+
+    Some loading matrices have the measured attribute variance still
+    built into the loading terms (like covariance), while some have
+    the variance scaled out of them (i.e. unit eigenvectors, like
+    correlations).  If the variance is built in, this function finds
+    it.
+    """
+    return (A**2).sum(axis=0)
+
+
+def sort_eigenvalues(A):
+    """Sort non-unitary eigenvalues in decreasing magnitude
+    """
+    evals = eigenvalues(A)
+    B = 0*A
+    for i,j in enumerate(reversed(evals.argsort())):
+        B[:,i] = A[:,j]
+    return B
+
+
+def reduce_double_negatives(A):
+    """Flip column signs so each factor's largest-magnitude loading is positive
+    """
+    for i in range(A.shape[1]):
+        if abs(A[:,i].min()) > A[:,i].max():
+            A[:,i] *= -1
+    return A
+
+
+def explained_variance(eigenvalues, total=None):
+    """Fraction of the total variance explained by each factor
+
+    The common factors have unit variance, so the (squared)
+    eigenvalues of the factors give the variance of the modeled
+    attributes.  "Squared" is in parentheses, because you need to
+    square the eigenvalues calculated using the score SVD (converting
+    standard deviations to variance), but you don't need to square
+    eigenvalues calculated from the covariance matrix.
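+
+    For the standardized, 1/√d-scaled score matrix X used by pca(),
+    the SVD X = U S Vᵀ gives XᵀX = V S² Vᵀ, so the singular values in
+    S must be squared to become variances.  A rough sketch of the
+    relationship (mirroring what pca() does; not run as a doctest):
+
+        X = (scores - scores.mean(axis=0)) / scores.std(axis=0)
+        X /= _numpy.sqrt(len(scores))
+        U, s, Vt = _numpy.linalg.svd(X, full_matrices=False)
+        evals = s**2  # matches the eigenvalues of X.T.dot(X)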
+ """ + if total is None: + total = eigenvalues.sum() + return eigenvalues / total + + +def varimax(A, max_iterations=None, epsilon=1e-7): + """Return the varimax-rotated version of a factor-loading matrix A + + z = 1/d ∑ⱼ (Aⱼᵤ + iAⱼᵥ)⁴ - (1/d ∑ⱼ (Aⱼᵤ + iAⱼᵥ)²)² + φ* = 1/4 ∠(z) + """ + d,k = A.shape + od = 1.0/d + if k == 1: # only one common factor, so there's nothing to rotate + return A + if max_iterations is None: + if k == 2: # only one combination, so repetition is redundant + max_iterations = 1 + else: + max_iterations = 50 + ovc = varimax_criterion(A) + for i in range(max_iterations): + for p,q in _itertools.combinations(range(k), 2): + Aj = _numpy.array([_numpy.complex(Ajp,Ajq) + for Ajp,Ajq in zip(A[:,p], A[:,q])]) + z = od * (Aj**4).sum() - (od * (Aj**2).sum())**2 + phi = 0.25 * _numpy.angle(z) + R = _numpy.eye(k) + R[p,p] = R[q,q] = _numpy.cos(phi) + R[q,p] = _numpy.sin(phi) + R[p,q] = - R[q,p] + A = A.dot(R) + vc = varimax_criterion(A) + if (vc - ovc) < epsilon * vc: + return A # converged :) + ovc = vc + # didn't converge (if k != 2) :( + return A + + +def analyze(scores, output_dim=3, unitary=False, delimiter='\t', missing=''): + d,k = scores.shape + masked_scores = _numpy.ma.array(scores, mask=_numpy.isnan(scores)) + mu = masked_scores.mean(axis=0) + std = masked_scores.std(axis=0, ddof=output_dim) + masked_scores = (masked_scores - mu).filled(0) + evals,A = pca(masked_scores, output_dim=output_dim) + if not unitary: + A *= _numpy.sqrt(evals[:output_dim]) + A = varimax(A) + if not unitary: + A = sort_eigenvalues(A) + A = reduce_double_negatives(A) + print('#' + delimiter.join( + [ + 'eig', 'prop', 'r-eig', 'r-prop', 'com' + ] + [ + 'f-{}'.format(i) for i in range(output_dim) + ] + [ + 'mean', 'std'])) + r_evals = eigenvalues(A) + explained = explained_variance(evals) + r_explained = explained_variance(r_evals, total=evals.sum()) + com = communalities(A) + for i in range(k): + r_eval = r_exp = missing + if i < len(r_evals): + r_eval = r_evals[i] + r_exp = r_explained[i] + row = [ + evals[i], + explained[i], + r_eval, + r_exp, + com[i], + ] + list(A[i,:]) + [ + mu[i], + std[i], + ] + print_row(row, delimiter=delimiter, precision=3) + + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + '-n', dest='output_dim', type=int, default=1, + help='number of common factors to use') + parser.add_argument( + '-u', dest='unitary', default=False, action='store_const', const=True, + help='use unitary eigenvalues for factor loading') + parser.add_argument( + 'path', + help='path to the tab-delimited data file to analyze') + + args = parser.parse_args() + + scores = _numpy.genfromtxt(fname=args.path, delimiter='\t') + analyze(scores=scores, output_dim=args.output_dim, unitary=args.unitary)
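+    # Example invocation (`scores.tsv` is a hypothetical tab-delimited
+    # file of raw scores, one row per observation):
+    #
+    #   ./pca.py -n 3 scores.tsv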