--- /dev/null
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2013 W. Trevor King <wking@drexel.edu>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this program. If not, see
+# <http://www.gnu.org/licenses/>.
+
+"""Understanding principal component analysis and varimax rotation
+
+Based on Hervé Abdi's example [1] and Trevor Park's implementation notes
+[2] for varimax rotation.
+
+>>> import numpy
+>>> scores = numpy.array( # five wines scored with seven tests
+... [[14, 7, 8, 7, 7, 13, 7],
+... [10, 7, 6, 4, 3, 14, 7],
+... [ 8, 5, 6, 10, 5, 12, 5],
+... [ 2, 4, 7, 16, 7, 11, 3],
+... [ 6, 2, 4, 13, 3, 10, 3]])
+>>> A = numpy.array( # original loadings (after PCA)
+... [[-0.3965, 0.1149], # hedonic
+... [-0.4454, -0.1090], # for meat
+... [-0.2646, -0.5854], # for dessert
+... [ 0.4160, -0.3111], # price
+... [-0.0485, -0.7245], # sugar
+... [-0.4385, 0.0555], # alcohol
+... [-0.4547, 0.0865]]) # acidity
+>>> print(varimax_criterion(A).round(5))
+0.02112
+
+>>> Ar = varimax(A)
+>>> Ar.round(4)
+array([[-0.4117, 0.03 ],
+ [-0.413 , -0.1991],
+ [-0.1372, -0.6276],
+ [ 0.4716, -0.2179],
+ [ 0.1031, -0.7188],
+ [-0.4405, -0.0368],
+ [-0.4627, -0.0098]])
+>>> print(varimax_criterion(Ar).round(5))
+0.02367
+
+This is slightly better than Abdi's rotation (his table 3):
+
+>>> Az = numpy.array( # Abdi's rotated loadings (his table 3)
+... [[-0.4125, 0.0153], # hedonic
+... [-0.4057, -0.2138], # for meat
+... [-0.1147, -0.6321], # for dessert
+... [ 0.4790, -0.2010], # price
+... [ 0.1286, -0.7146], # sugar
+... [-0.4389, -0.0525], # alcohol
+... [-0.4620, -0.0264]]) # acidity
+>>> print(varimax_criterion(Az).round(5))
+0.02359
+
+There may be some copy/paste or rounding errors in Abdi's table 3,
+because he claims a rotation of -15 degrees in his text (which matches
+my calculated value), but his table 3 has only been rotated by -14
+degrees.
+
+>>> for i,(a,ar,az) in enumerate(zip(A, Ar, Az)):
+... theta_a = numpy.arctan2(a[1], a[0])
+... theta_r = numpy.arctan2(ar[1], ar[0])
+... theta_z = numpy.arctan2(az[1], az[0])
+... # (theta_init - theta_final) so we can print *axis* rotation
+... dr = ((theta_a - theta_r) * 180 / numpy.pi) % 360 - 360
+... dz = ((theta_a - theta_z) * 180 / numpy.pi) % 360 - 360
+... print("{}: {:.2f} (vs Abdi's {:.2f})".format(i, dr, dz))
+0: -11.99 (vs Abdi's -14.04)
+1: -11.99 (vs Abdi's -14.04)
+2: -11.99 (vs Abdi's -14.04)
+3: -11.99 (vs Abdi's -14.03)
+4: -11.99 (vs Abdi's -14.03)
+5: -11.99 (vs Abdi's -14.03)
+6: -11.99 (vs Abdi's -14.04)
+
+These loading matrices all have the variance stripped out:
+
+>>> print_row(eigenvalues(A), precision=3)
+ 1.000 1.000
+>>> print_row(eigenvalues(Ar), precision=3)
+ 1.000 1.000
+>>> print_row(eigenvalues(Az), precision=3)
+ 1.000 1.000
+
+As an additional test, let's try to reproduce the factor loadings from
+the raw scores:
+
+>>> evals,Ai = pca(scores, output_dim=2)
+>>> print_row(evals, precision=3)
+ 4.799 1.846 0.352 0.003 0.000
+>>> varimax(Ai).round(4)
+array([[-0.4113, -0.0306],
+ [-0.4111, 0.1994],
+ [-0.1435, 0.6274],
+ [ 0.4723, 0.218 ],
+ [ 0.1076, 0.7188],
+ [-0.4396, 0.0372],
+ [-0.4619, 0.0098]])
+
+This is quite similar (modulo column sign) to our earlier rotation of
+Abdi's PCA results, and the eigenvalues are close to his (4.7627,
+1.8101, 0.3527, and 0.0744).
+
+>>> Ar.round(4)
+array([[-0.4117, 0.03 ],
+ [-0.413 , -0.1991],
+ [-0.1372, -0.6276],
+ [ 0.4716, -0.2179],
+ [ 0.1031, -0.7188],
+ [-0.4405, -0.0368],
+ [-0.4627, -0.0098]])
+
+The fraction of the variance explained by the first two factors is:
+
+>>> print(format_number(explained_variance(evals)[:2].sum()))
+ 0.95
+
+which is reasonably close to Abdi's claimed 94%. The communalities
+will be:
+
+>>> print_row(communalities(Ai))
+ 0.17 0.21 0.41 0.27 0.53 0.19 0.21
+
+These loading matrices all have the variance stripped out:
+
+>>> print_row(eigenvalues(Ai), precision=3)
+ 1.000 1.000
+>>> print_row(eigenvalues(Ar), precision=3)
+ 1.000 1.000
+
+Checks against Andrew Wiesner
+-----------------------------
+
+Checking against the example loadings given by Andrew Wiesner [3]:
+
+>>> A = numpy.array(
+... [[0.286, 0.076, 0.841],
+... [0.698, 0.153, 0.084],
+... [0.744, -0.410, -0.020],
+... [0.471, 0.522, 0.135],
+... [0.681, -0.156, -0.148],
+... [0.498, -0.498, -0.253],
+... [0.861, -0.115, 0.011],
+... [0.642, 0.322, 0.044],
+... [0.298, 0.595, -0.533]])
+
+Wiesner also gives the eigenvalues [3],
+
+>>> evals = numpy.array(
+... [3.2978,
+... 1.2136,
+... 1.1055,
+... 0.9073,
+... 0.8606,
+... 0.5622,
+... 0.4838,
+... 0.3181,
+... 0.2511])
+
+communalities [4],
+
+>>> com_z = numpy.array(
+... [0.79500707,
+... 0.51783185,
+... 0.72230182,
+... 0.51244913,
+... 0.50977159,
+... 0.56073895,
+... 0.75382091,
+... 0.51725940,
+... 0.72770402])
+
+and varimax-rotated loadings [5]:
+
+>>> Az = numpy.array(
+... [[ 0.021, 0.239, 0.859],
+... [ 0.483, 0.547, 0.166],
+... [ 0.829, 0.127, 0.137],
+... [ 0.031, 0.702, 0.139],
+... [ 0.652, 0.289, -0.028],
+... [ 0.734, -0.094, -0.117],
+... [ 0.738, 0.432, 0.150],
+... [ 0.301, 0.656, 0.099],
+... [-0.022, 0.651, -0.551]])
+
+The basic quantities match our calculations almost exactly [3]:
+
+>>> print_row(explained_variance(evals), precision=3)
+ 0.366 0.135 0.123 0.101 0.096 0.062 0.054 0.035 0.028
+
+(Wiesner has 0.3664, 0.1348, 0.1228, 0.1008, 0.0956, 0.0625, 0.0538,
+0.0353, and 0.0279 [3])
+
+>>> print(format_number(explained_variance(evals)[:3].sum()))
+... # fraction of total variance explained by the first 3 common factors
+ 0.62
+
+(Wiesner has 62% [3])
+
+>>> print_row(communalities(A), precision=3)
+ 0.795 0.518 0.722 0.513 0.510 0.560 0.755 0.518 0.727
+>>> print_row(com_z, precision=3)
+ 0.795 0.518 0.722 0.512 0.510 0.561 0.754 0.517 0.728
+
+To get the rotated loadings to match, we need to scale by the
+eigenvalues:
+
+>>> Ar = varimax(A)
+>>> Ar.round(3)
+array([[ 0.078, -0.06 , 0.886],
+ [ 0.497, 0.423, 0.303],
+ [ 0.842, 0.006, 0.112],
+ [ 0.107, 0.611, 0.357],
+ [ 0.677, 0.228, 0.022],
+ [ 0.715, -0.11 , -0.193],
+ [ 0.784, 0.296, 0.229],
+ [ 0.368, 0.55 , 0.281],
+ [ 0.022, 0.795, -0.307]])
+
+The rotated loadings don't match Wiesner's exactly, but they're in the
+same ballpark, and we have a higher varimax criterion:
+
+>>> print(format_number(varimax_criterion(Ar), precision=4))
+ 0.0591
+>>> print(format_number(varimax_criterion(Az), precision=4))
+ 0.0552
+
+These loading matrices all have the variance still included:
+
+>>> print_row(evals, precision=3)
+ 3.298 1.214 1.105 0.907 0.861 0.562 0.484 0.318 0.251
+>>> print_row(eigenvalues(A), precision=3)
+ 3.298 1.213 1.105
+>>> print_row(eigenvalues(Ar), precision=3)
+ 2.693 1.643 1.280
+>>> print_row(eigenvalues(Az), precision=3)
+ 2.522 1.998 1.154
+
+Checks against Daniel Denis
+---------------------------
+
+Checking against the example PCA given by Daniel Denis [6]:
+
+>>> scores = numpy.array(
+... [[32, 64, 65, 67],
+... [61, 37, 62, 65],
+... [59, 40, 45, 43],
+... [36, 62, 34, 35],
+... [62, 46, 43, 40]])
+>>> evals,Ai = pca(scores, output_dim=2)
+>>> print_row(eigenvalues(Ai), precision=3)
+ 1.000 1.000
+>>> Ai *= numpy.sqrt(evals[:2]) # insert the variance
+>>> print_row(eigenvalues(Ai), precision=3)
+ 2.016 1.942
+>>> print_row(evals, precision=3)
+ 2.016 1.942 0.038 0.004
+
+(Denis has the exact same values on page 4)
+
+>>> print_row(explained_variance(evals))
+ 0.50 0.49 0.01 0.00
+
+(Denis has 50.408%, 48.538%, 0.945%, and 0.109%)
+
+>>> Ar = varimax(Ai)
+>>> Ar.round(4)
+array([[ 0.0872, -0.9877],
+ [ 0.0723, 0.9886],
+ [-0.9973, -0.0258],
+ [-0.9976, 0.0401]])
+
+This matches up well (modulo column sign) with Denis' loadings:
+
+>>> A = numpy.array(
+... [[-0.500, 0.856],
+... [ 0.357, -0.925],
+... [ 0.891, 0.449],
+... [ 0.919, 0.390]])
+>>> Az = varimax(A)
+>>> Az.round(4)
+array([[-0.0867, 0.9875],
+ [-0.0721, -0.9889],
+ [ 0.9974, 0.0256],
+ [ 0.9975, -0.0397]])
+
+Rotating Denis' loadings gives an equally tight alignment, as you can
+see from the matching varimax criteria:
+
+>>> print(format_number(varimax_criterion(Ar), precision=4))
+ 0.2411
+>>> print(format_number(varimax_criterion(Az), precision=4))
+ 0.2411
+
+These loading matrices all have the variance still included (because
+we added it to our PCA loadings by hand):
+
+>>> print_row(eigenvalues(A), precision=3)
+ 2.016 1.942
+>>> print_row(eigenvalues(Az), precision=3)
+ 2.003 1.955
+>>> print_row(eigenvalues(Ai), precision=3)
+ 2.016 1.942
+>>> print_row(eigenvalues(Ar), precision=3)
+ 2.003 1.955
+
+Denis also has a separate varimax example (page 7):
+
+>>> A = numpy.array(
+... [[-0.400, 0.900],
+... [ 0.251, -0.947],
+... [ 0.932, 0.348],
+... [ 0.956, 0.286]])
+
+which he rotates into:
+
+>>> Az = numpy.array(
+... [[-0.086, -0.981],
+... [-0.071, 0.977],
+... [ 0.994, -0.026],
+... [ 0.997, 0.040]])
+
+This matches up well (modulo column sign) with ours:
+
+>>> varimax(A).round(3)
+array([[-0.085, 0.981],
+ [-0.071, -0.977],
+ [ 0.995, 0.026],
+ [ 0.997, -0.041]])
+
+Checks against SPSS via Newcastle upon Tyne
+-------------------------------------------
+
+Putting everything together for some sample data from an old Newcastle
+page [7]:
+
+>>> scores = numpy.array(
+... [[1, 3, 4, 6, 7, 2, 4, 5],
+... [2, 3, 4, 3, 4, 6, 7, 6],
+... [4, 5, 6, 7, 7, 2, 3, 4],
+... [3, 4, 5, 6, 7, 3, 5, 4],
+... [2, 5, 5, 5, 6, 2, 4, 5],
+... [3, 4, 6, 7, 7, 4, 3, 5],
+... [2, 3, 6, 4, 5, 4, 4, 4],
+... [1, 3, 4, 5, 6, 3, 3, 4],
+... [3, 3, 5, 6, 6, 4, 4, 3],
+... [4, 4, 5, 6, 7, 4, 3, 4],
+... [2, 3, 6, 7, 5, 4, 4, 4],
+... [2, 3, 5, 7, 6, 3, 3, 3]])
+>>> analyze(scores, output_dim=3, unitary=False, delimiter=' ', missing=' '*6)
+#eig prop r-eig r-prop com f-0 f-1 f-2 mean std
+ 3.709 0.464 2.456 0.307 0.844 -0.071 -0.122 0.908 2.417 1.101
+ 1.478 0.185 2.072 0.259 0.918 -0.603 0.310 0.677 3.583 0.877
+ 1.361 0.170 2.020 0.252 0.735 0.075 -0.429 0.738 5.083 0.877
+ 0.600 0.075 0.768 -0.485 -0.659 0.313 5.750 1.424
+ 0.417 0.052 0.814 -0.837 -0.219 0.256 6.083 1.101
+ 0.281 0.035 0.865 0.909 0.170 0.102 3.417 1.287
+ 0.129 0.016 0.800 0.564 0.682 -0.131 3.917 1.287
+ 0.026 0.003 0.804 0.051 0.895 -0.032 4.250 0.957
+
+This matches the values given on the Newcastle page (where I've
+unshuffled their rows to match the initial column ordering used for
+the scores):
+
+ 3.709 0.464 2.501 0.313 0.844 0.908 2.42 1.00
+ 1.478 0.185 2.045 0.256 0.918 0.637 0.634 3.58 0.79
+ 1.361 0.170 2.002 0.250 0.735 0.760 5.08 0.79
+ 0.600 0.075 0.768 -0.652 5.75 1.29
+ 0.417 0.052 0.814 0.845 6.08 1.00
+ 0.281 0.035 0.865 -0.901 3.42 1.16
+ 0.129 0.016 0.800 -0.560 0.684 3.92 1.16
+ 0.026 0.003 0.804 0.893 4.25 0.87
+
+Their standard deviations differ from mine because they used a delta
+degrees of freedom of one, while I used the number of factors:
+
+>>> print_row(scores.std(axis=0, ddof=1))
+ 1.00 0.79 0.79 1.29 1.00 1.16 1.16 0.87
+
+[1]: http://www.utd.edu/~herve/Abdi-rotations-pretty.pdf
+[2]: http://www.stat.ufl.edu/~tpark/Research/about_varimax.html
+[3]: http://sites.stat.psu.edu/~ajw13/stat505/fa06/17_factor/06_factor_example.html
+[4]: http://sites.stat.psu.edu/~ajw13/stat505/fa06/17_factor/07_factor_commun.html
+[5]: http://sites.stat.psu.edu/~ajw13/stat505/fa06/17_factor/13_factor_varimax.html
+[6]: http://psychweb.psy.umt.edu/denis/datadecision/factor/dd_fa_part_2_aug_2009.pdf
+[7]: http://web.archive.org/web/20051125011642/http://www.ncl.ac.uk/iss/statistics/docs/factoranalysis.html
+"""
+
+import itertools as _itertools
+
+import numpy as _numpy
+
+
+def format_number(x, precision=2):
+ if isinstance(x, str):
+ return x
+ elif _numpy.isnan(x) or isinstance(x, int):
+ return str(x)
+ format_string = '{{: .{}f}}'.format(precision)
+ return format_string.format(x)
+
+
+def print_row(row, delimiter=' ', precision=2):
+ print(delimiter.join(format_number(x, precision=precision) for x in row))
+
+
+def print_matrix(matrix, delimiter=' ', precision=2):
+ print('\n'.join(delimiter.join(format_number(x, precision=precision)
+ for x in row)
+ for row in matrix))
+
+
+def varimax_criterion(A):
+ """Return the varimax criterion of a factor-loading matrix A
+
+ V(A) = 1/k ∑ⱼ (1/d ∑ᵢ Aᵢⱼ⁴ - (1/d ∑ᵢ Aᵢⱼ²)²)
+ = 1/k ∑ⱼ (<Aᵢⱼ⁴>ᵢ - <Aᵢⱼ²>ᵢ²)
+ = var(A²)
+ """
+ return _numpy.var(A**2)
+
+
+def pca(scores, ddof=0, output_dim=None):
+ """Principal component analysis via singular value decomposition
+
+ Return an array of eigenvalues (variance scaling) and an
+ orthogonal matrix of unitary eigenvectors (the factor loadings).
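+
+    A quick shape check on made-up scores (four samples of three
+    measured attributes, reduced to two factors):
+
+    >>> import numpy
+    >>> scores = numpy.array(
+    ...     [[1., 2., 3.], [2., 1., 4.], [3., 3., 1.], [4., 2., 2.]])
+    >>> evals, A = pca(scores, output_dim=2)
+    >>> evals.shape, A.shape
+    ((3,), (3, 2))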
+ """
+ X = scores
+ d,k = X.shape
+    # standardize so X.T.dot(X) is the correlation matrix (when ddof=0)
+    X = (X - X.mean(axis=0)) / X.std(axis=0, ddof=0) / _numpy.sqrt(d-ddof)
+    U,s,V = _numpy.linalg.svd(X, full_matrices=False)
+    S = _numpy.diag(s)
+    G = U.dot(S) # factor scores (unused here)
+    # numpy's SVD returns the singular values in descending order
+    ss = _numpy.sort(s)[::-1]
+    assert (s == ss).all(), (s, ss, s-ss)
+ if output_dim is None:
+ output_dim = V.T.shape[1]
+ eigenvalues = s**2 # see explained_variance.__doc__
+ return (eigenvalues, V.T[:,:output_dim])
+
+
+def communalities(A):
+ """Communalities for each measured attribute
+
+    This is how well the model fits each attribute. It's similar to the
+ coefficient of determination (R²) for linear regressions.
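+
+    For example, an attribute that loads 0.6 on one factor and 0.8 on
+    another is fully explained by the two-factor model (a made-up
+    sanity check):
+
+    >>> import numpy
+    >>> print(format_number(communalities(numpy.array([[0.6, 0.8]]))[0]))
+     1.00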
+
+ http://sites.stat.psu.edu/~ajw13/stat505/fa06/17_factor/07_factor_commun.html
+ """
+ return (A**2).sum(axis=1).T
+
+
+def eigenvalues(A):
+ """Extract the measured attribute variance from the loading matrix
+
+    Some loading matrices have the measured attribute variance still
+ built into the loading terms (like covariance), while some have
+ the variance scaled out of them (i.e. unit eigenvectors, like
+    correlations). If the variance is built in, this function finds
+ it.
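+
+    Unit-length columns (variance stripped out) give unit eigenvalues
+    (a made-up check):
+
+    >>> import numpy
+    >>> A = numpy.array([[0.6, 0.8], [0.8, -0.6]])
+    >>> print(format_number(eigenvalues(A)[0]))
+     1.00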
+ """
+ return (A**2).sum(axis=0)
+
+
+def sort_eigenvalues(A):
+ """Sort non-unitary eigenvalues in decreasing magnitude
+ """
+ evals = eigenvalues(A)
+ B = 0*A
+ for i,j in enumerate(reversed(evals.argsort())):
+ B[:,i] = A[:,j]
+ return B
+
+
+def reduce_double_negatives(A):
+ """Flip eigenvalue sign to make largest component positive
+ """
+ for i in range(A.shape[1]):
+ if abs(A[:,i].min()) > A[:,i].max():
+ A[:,i] *= -1
+ return A
+
+
+def explained_variance(eigenvalues, total=None):
+ """Fraction of explained variance for each measured attribute
+
+ The common factors have unit variance, so the (squared)
+ eigenvalues of the factors give the variance of the modeled
+ attributes. "Squared" is in parenthesis, because you need to
+ square the eigenvalues calculated using the score SVD (converting
+ standard deviations to variance), but you don't need to square
+ eigenvalues calculated from the covariance matrix.
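+
+    For example, eigenvalues of 3 and 1 explain 75% and 25% of the
+    total variance (a made-up check):
+
+    >>> import numpy
+    >>> print(format_number(explained_variance(numpy.array([3.0, 1.0]))[0]))
+     0.75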
+ """
+ if total is None:
+ total = eigenvalues.sum()
+ return eigenvalues / total
+
+
+def varimax(A, max_iterations=None, epsilon=1e-7):
+ """Return the varimax-rotated version of a factor-loading matrix A
+
+ z = 1/d ∑ⱼ (Aⱼᵤ + iAⱼᵥ)⁴ - (1/d ∑ⱼ (Aⱼᵤ + iAⱼᵥ)²)²
+ φ* = 1/4 ∠(z)
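+
+    A loading matrix that already has perfect simple structure is left
+    unchanged (a made-up sanity check):
+
+    >>> import numpy
+    >>> varimax(numpy.eye(2)).tolist()
+    [[1.0, 0.0], [0.0, 1.0]]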
+ """
+ d,k = A.shape
+ od = 1.0/d
+ if k == 1: # only one common factor, so there's nothing to rotate
+ return A
+ if max_iterations is None:
+ if k == 2: # only one combination, so repetition is redundant
+ max_iterations = 1
+ else:
+ max_iterations = 50
+ ovc = varimax_criterion(A)
+ for i in range(max_iterations):
+ for p,q in _itertools.combinations(range(k), 2):
+            # pack the two columns into a single complex vector, Aⱼᵤ + iAⱼᵥ
+            Aj = A[:,p] + 1j * A[:,q]
+ z = od * (Aj**4).sum() - (od * (Aj**2).sum())**2
+ phi = 0.25 * _numpy.angle(z)
+            R = _numpy.eye(k) # Givens rotation of factors p and q by phi
+ R[p,p] = R[q,q] = _numpy.cos(phi)
+ R[q,p] = _numpy.sin(phi)
+ R[p,q] = - R[q,p]
+ A = A.dot(R)
+ vc = varimax_criterion(A)
+ if (vc - ovc) < epsilon * vc:
+ return A # converged :)
+ ovc = vc
+ # didn't converge (if k != 2) :(
+ return A
+
+
+def analyze(scores, output_dim=3, unitary=False, delimiter='\t', missing=''):
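+    """Print a factor-analysis summary table for a matrix of scores
+
+    Each row i pairs the i-th principal component's eigenvalue (eig)
+    and fraction of the total variance (prop), plus the rotated
+    equivalents (r-eig, r-prop) when i is less than output_dim, with
+    the i-th measured attribute's communality (com), rotated factor
+    loadings (f-0, f-1, ...), mean, and standard deviation (std).
+    """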
+ d,k = scores.shape
+ masked_scores = _numpy.ma.array(scores, mask=_numpy.isnan(scores))
+ mu = masked_scores.mean(axis=0)
+ std = masked_scores.std(axis=0, ddof=output_dim)
+ masked_scores = (masked_scores - mu).filled(0)
+ evals,A = pca(masked_scores, output_dim=output_dim)
+ if not unitary:
+ A *= _numpy.sqrt(evals[:output_dim])
+ A = varimax(A)
+ if not unitary:
+ A = sort_eigenvalues(A)
+ A = reduce_double_negatives(A)
+ print('#' + delimiter.join(
+ [
+ 'eig', 'prop', 'r-eig', 'r-prop', 'com'
+ ] + [
+ 'f-{}'.format(i) for i in range(output_dim)
+ ] + [
+ 'mean', 'std']))
+ r_evals = eigenvalues(A)
+ explained = explained_variance(evals)
+ r_explained = explained_variance(r_evals, total=evals.sum())
+ com = communalities(A)
+ for i in range(k):
+ r_eval = r_exp = missing
+ if i < len(r_evals):
+ r_eval = r_evals[i]
+ r_exp = r_explained[i]
+ row = [
+ evals[i],
+ explained[i],
+ r_eval,
+ r_exp,
+ com[i],
+ ] + list(A[i,:]) + [
+ mu[i],
+ std[i],
+ ]
+ print_row(row, delimiter=delimiter, precision=3)
+
+
+if __name__ == '__main__':
+ import argparse
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '-n', dest='output_dim', type=int, default=1,
+ help='number of common factors to use')
+ parser.add_argument(
+ '-u', dest='unitary', default=False, action='store_const', const=True,
+ help='use unitary eigenvalues for factor loading')
+ parser.add_argument(
+ 'path',
+ help='path to the tab-delimited data file to analyze')
+
+ args = parser.parse_args()
+
+ scores = _numpy.genfromtxt(fname=args.path, delimiter='\t')
+ analyze(scores=scores, output_dim=args.output_dim, unitary=args.unitary)