# -*- coding: utf-8 -*-
#
# Copyright (C) 2013 W. Trevor King <wking@drexel.edu>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program.  If not, see
# <http://www.gnu.org/licenses/>.
20 """Understanding principal component analysis and varimax rotation
22 Based Hervé Abdi's example [1] and Trevor Park's implementation notes
23 [2] for varimax rotation.
>>> scores = numpy.array(  # five wines scored with seven tests
...     [[14, 7, 8, 7, 7, 13, 7],
...      [10, 7, 6, 4, 3, 14, 7],
...      [ 8, 5, 6, 10, 5, 12, 5],
...      [ 2, 4, 7, 16, 7, 11, 3],
...      [ 6, 2, 4, 13, 3, 10, 3]])
>>> A = numpy.array(  # original loadings (after PCA)
...     [[-0.3965, 0.1149],   # hedonic
...      [-0.4454, -0.1090],  # for meat
...      [-0.2646, -0.5854],  # for dessert
...      [ 0.4160, -0.3111],  # price
...      [-0.0485, -0.7245],  # sugar
...      [-0.4385, 0.0555],   # alcohol
...      [-0.4547, 0.0865]])  # acidity
>>> print(varimax_criterion(A).round(5))

>>> Ar = varimax(A)
>>> Ar.round(4)
array([[-0.4117,  0.03  ],

>>> print(varimax_criterion(Ar).round(5))

This is slightly better than Abdi's rotation (his table 3):

>>> Az = numpy.array(  # Abdi's rotated loadings (his table 3)
...     [[-0.4125, 0.0153],   # hedonic
...      [-0.4057, -0.2138],  # for meat
...      [-0.1147, -0.6321],  # for dessert
...      [ 0.4790, -0.2010],  # price
...      [ 0.1286, -0.7146],  # sugar
...      [-0.4389, -0.0525],  # alcohol
...      [-0.4620, -0.0264]]) # acidity
>>> print(varimax_criterion(Az).round(5))

There may be some copy/paste or rounding errors in Abdi's table 3,
because he claims a rotation of -15 degrees in his text (which matches
my calculated value), but his table 3 has only been rotated by -14
degrees:
>>> for i,(a,ar,az) in enumerate(zip(A, Ar, Az)):
...     theta_a = numpy.arctan2(a[1], a[0])
...     theta_r = numpy.arctan2(ar[1], ar[0])
...     theta_z = numpy.arctan2(az[1], az[0])
...     # (theta_init - theta_final) so we can print *axis* rotation
...     dr = ((theta_a - theta_r) * 180 / numpy.pi) % 360 - 360
...     dz = ((theta_a - theta_z) * 180 / numpy.pi) % 360 - 360
...     print("{}: {:.2f} (vs Abdi's {:.2f})".format(i, dr, dz))
0: -11.99 (vs Abdi's -14.04)
1: -11.99 (vs Abdi's -14.04)
2: -11.99 (vs Abdi's -14.04)
3: -11.99 (vs Abdi's -14.03)
4: -11.99 (vs Abdi's -14.03)
5: -11.99 (vs Abdi's -14.03)
6: -11.99 (vs Abdi's -14.04)

These loading matrices all have the variance stripped out:

>>> print_row(eigenvalues(A), precision=3)
1.000 1.000
>>> print_row(eigenvalues(Ar), precision=3)
1.000 1.000
>>> print_row(eigenvalues(Az), precision=3)
1.000 1.000

As an additional test, let's try to reproduce the factor loadings from
the raw scores:

>>> evals,Ai = pca(scores, output_dim=2)
>>> print_row(evals, precision=3)
4.799 1.846 0.352 0.003 0.000
>>> varimax(Ai).round(4)
array([[-0.4113, -0.0306],

This is quite similar (modulo column sign) to our earlier rotation of
Abdi's PCA results, and the eigenvalues are close to his (4.7627,
1.8101, 0.3527, and 0.0744).

>>> Ar.round(4)
array([[-0.4117,  0.03  ],

The fraction of the variance explained by the first two factors is:

>>> print(format_number(explained_variance(evals)[:2].sum()))
0.95

which is reasonably close to Abdi's claimed 94%.  The communalities
are:

>>> print_row(communalities(Ai))
0.17 0.21 0.41 0.27 0.53 0.19 0.21

These loading matrices all have the variance stripped out:

>>> print_row(eigenvalues(Ai), precision=3)
1.000 1.000
>>> print_row(eigenvalues(Ar), precision=3)
1.000 1.000

Checks against Andrew Wiesner
-----------------------------

Checking against the example loadings given by Andrew Wiesner [3]:

>>> A = numpy.array(
...     [[0.286, 0.076, 0.841],
...      [0.698, 0.153, 0.084],
...      [0.744, -0.410, -0.020],
...      [0.471, 0.522, 0.135],
...      [0.681, -0.156, -0.148],
...      [0.498, -0.498, -0.253],
...      [0.861, -0.115, 0.011],
...      [0.642, 0.322, 0.044],
...      [0.298, 0.595, -0.533]])

Wiesner also gives the eigenvalues [3],

>>> evals = numpy.array(
...     [3.298, 1.214, 1.105, 0.907, 0.861, 0.562, 0.484, 0.318, 0.251])

the communalities [4],

>>> com_z = numpy.array(
...     [0.795, 0.518, 0.722, 0.512, 0.510, 0.561, 0.754, 0.517, 0.728])

and his varimax-rotated loadings [5]:

>>> Az = numpy.array(
...     [[ 0.021, 0.239, 0.859],
...      [ 0.483, 0.547, 0.166],
...      [ 0.829, 0.127, 0.137],
...      [ 0.031, 0.702, 0.139],
...      [ 0.652, 0.289, -0.028],
...      [ 0.734, -0.094, -0.117],
...      [ 0.738, 0.432, 0.150],
...      [ 0.301, 0.656, 0.099],
...      [-0.022, 0.651, -0.551]])

The basic stuff matches our calculations almost exactly [3]:

>>> print_row(explained_variance(evals), precision=3)
0.366 0.135 0.123 0.101 0.096 0.062 0.054 0.035 0.028

(Wiesner has 0.3664, 0.1348, 0.1228, 0.1008, 0.0956, 0.0625, 0.0538,
0.0353, and 0.0279 [3])

>>> print(format_number(explained_variance(evals)[:3].sum()))
... # fraction of total variance explained by the first 3 common factors
0.62

(Wiesner has 62% [3])

>>> print_row(communalities(A), precision=3)
0.795 0.518 0.722 0.513 0.510 0.560 0.755 0.518 0.727
>>> print_row(com_z, precision=3)
0.795 0.518 0.722 0.512 0.510 0.561 0.754 0.517 0.728

To get the rotated loadings to match, we would apparently need to
scale by the communalities (Kaiser normalization); rotating the
loadings as-is gives:

>>> Ar = varimax(A)
>>> Ar.round(3)
array([[ 0.078, -0.06 ,  0.886],
       [ 0.497,  0.423,  0.303],
       [ 0.842,  0.006,  0.112],
       [ 0.107,  0.611,  0.357],
       [ 0.677,  0.228,  0.022],
       [ 0.715, -0.11 , -0.193],
       [ 0.784,  0.296,  0.229],
       [ 0.368,  0.55 ,  0.281],
       [ 0.022,  0.795, -0.307]])

but they're in the same ballpark, and we have a higher varimax
criterion:

>>> print(format_number(varimax_criterion(Ar), precision=4))

>>> print(format_number(varimax_criterion(Az), precision=4))

These loading matrices all have the variance still included:

>>> print_row(evals, precision=3)
3.298 1.214 1.105 0.907 0.861 0.562 0.484 0.318 0.251
>>> print_row(eigenvalues(A), precision=3)

>>> print_row(eigenvalues(Ar), precision=3)

>>> print_row(eigenvalues(Az), precision=3)

Checks against Daniel Denis
---------------------------

Checking against the example PCA given by Daniel Denis [6]:

>>> scores = numpy.array(
...     [[32, 64, 65, 67],
...      [61, 37, 62, 65],
...      [59, 40, 45, 43],
...      [36, 62, 34, 35],
...      [62, 46, 43, 40]])
>>> evals,Ai = pca(scores, output_dim=2)
>>> print_row(eigenvalues(Ai), precision=3)
1.000 1.000
>>> Ai *= numpy.sqrt(evals[:2])  # insert the variance
>>> print_row(eigenvalues(Ai), precision=3)
2.016 1.942
>>> print_row(evals, precision=3)
2.016 1.942 0.038 0.004

(Denis has the exact same values on page 4)

>>> print_row(explained_variance(evals))
0.50 0.49 0.01 0.00

(Denis has 50.408%, 48.538%, 0.945%, and 0.109%)

>>> Ar = varimax(Ai)
>>> Ar.round(4)
array([[ 0.0872, -0.9877],

This matches up well (modulo column sign) with Denis' loadings:

>>> A = numpy.array(
...     [[-0.500,  0.856],
...      [ 0.357, -0.925],
...      [ 0.919,  0.390]])
>>> Az = varimax(A)
>>> Az.round(4)
array([[-0.0867,  0.9875],

Denis' loadings are much more tightly aligned, as you can see from
the varimax criteria:

>>> print(format_number(varimax_criterion(Ar), precision=4))

>>> print(format_number(varimax_criterion(Az), precision=4))

These loading matrices all have the variance still included (because
we added it to our PCA loadings by hand):

>>> print_row(eigenvalues(A), precision=3)

>>> print_row(eigenvalues(Az), precision=3)

>>> print_row(eigenvalues(Ai), precision=3)
2.016 1.942
>>> print_row(eigenvalues(Ar), precision=3)

Denis also has a separate varimax example (page 7):

>>> A = numpy.array(
...     [[-0.400,  0.900],
...      [ 0.251, -0.947],
...      [ 0.956,  0.286]])

which he rotates into:

>>> Az = numpy.array(
...     [[-0.086, -0.981],
...      [ 0.994, -0.026],
...      [ 0.997,  0.040]])

This matches up well (modulo column sign) with ours:

>>> varimax(A).round(3)
array([[-0.085,  0.981],

Checks against SPSS via Newcastle upon Tyne
-------------------------------------------

Putting everything together for some sample data from an old Newcastle
upon Tyne handout on factor analysis [7]:

>>> scores = numpy.array(
...     [[1, 3, 4, 6, 7, 2, 4, 5],
...      [2, 3, 4, 3, 4, 6, 7, 6],
...      [4, 5, 6, 7, 7, 2, 3, 4],
...      [3, 4, 5, 6, 7, 3, 5, 4],
...      [2, 5, 5, 5, 6, 2, 4, 5],
...      [3, 4, 6, 7, 7, 4, 3, 5],
...      [2, 3, 6, 4, 5, 4, 4, 4],
...      [1, 3, 4, 5, 6, 3, 3, 4],
...      [3, 3, 5, 6, 6, 4, 4, 3],
...      [4, 4, 5, 6, 7, 4, 3, 4],
...      [2, 3, 6, 7, 5, 4, 4, 4],
...      [2, 3, 5, 7, 6, 3, 3, 3]])
>>> analyze(scores, output_dim=3, unitary=False, delimiter=' ', missing=' '*6)
#eig prop r-eig r-prop com f-0 f-1 f-2 mean std
3.709 0.464 2.456 0.307 0.844 -0.071 -0.122 0.908 2.417 1.101
1.478 0.185 2.072 0.259 0.918 -0.603 0.310 0.677 3.583 0.877
1.361 0.170 2.020 0.252 0.735 0.075 -0.429 0.738 5.083 0.877
0.600 0.075               0.768 -0.485 -0.659 0.313 5.750 1.424
0.417 0.052               0.814 -0.837 -0.219 0.256 6.083 1.101
0.281 0.035               0.865 0.909 0.170 0.102 3.417 1.287
0.129 0.016               0.800 0.564 0.682 -0.131 3.917 1.287
0.026 0.003               0.804 0.051 0.895 -0.032 4.250 0.957

Which matches the values given on the Newcastle page (where I've
unshuffled their rows to match the initial column ordering used for
our output):

 3.709  0.464  2.501  0.313  0.844                0.908  2.42  1.00
 1.478  0.185  2.045  0.256  0.918  0.637         0.634  3.58  0.79
 1.361  0.170  2.002  0.250  0.735                0.760  5.08  0.79
 0.600  0.075                0.768        -0.652         5.75  1.29
 0.417  0.052                0.814  0.845                6.08  1.00
 0.281  0.035                0.865 -0.901                3.42  1.16
 0.129  0.016                0.800 -0.560  0.684         3.92  1.16
 0.026  0.003                0.804         0.893         4.25  0.87

Their standard deviations differ from mine because they were using
one for the delta degrees of freedom, while I was using the number
of output dimensions (three):

>>> print_row(scores.std(axis=0, ddof=1))
1.00 0.79 0.79 1.29 1.00 1.16 1.16 0.87

[1]: http://www.utd.edu/~herve/Abdi-rotations-pretty.pdf
[2]: http://www.stat.ufl.edu/~tpark/Research/about_varimax.html
[3]: http://sites.stat.psu.edu/~ajw13/stat505/fa06/17_factor/06_factor_example.html
[4]: http://sites.stat.psu.edu/~ajw13/stat505/fa06/17_factor/07_factor_commun.html
[5]: http://sites.stat.psu.edu/~ajw13/stat505/fa06/17_factor/13_factor_varimax.html
[6]: http://psychweb.psy.umt.edu/denis/datadecision/factor/dd_fa_part_2_aug_2009.pdf
[7]: http://web.archive.org/web/20051125011642/http://www.ncl.ac.uk/iss/statistics/docs/factoranalysis.html
"""

import itertools as _itertools

import numpy as _numpy


def format_number(x, precision=2):
    if isinstance(x, str):
        return x
    elif _numpy.isnan(x) or isinstance(x, int):
        return str(x)
    format_string = '{{: .{}f}}'.format(precision)
    return format_string.format(x)


def print_row(row, delimiter=' ', precision=2):
    print(delimiter.join(format_number(x, precision=precision) for x in row))


def print_matrix(matrix, delimiter=' ', precision=2):
    print('\n'.join(delimiter.join(format_number(x, precision=precision)
                                   for x in row)
                    for row in matrix))


def varimax_criterion(A):
    """Return the varimax criterion of a factor-loading matrix A

    V(A) = 1/k ∑ⱼ (1/d ∑ᵢ Aᵢⱼ⁴ - (1/d ∑ᵢ Aᵢⱼ²)²)
         = 1/k ∑ⱼ (<Aᵢⱼ⁴>ᵢ - <Aᵢⱼ²>ᵢ²)
    """
    return _numpy.var(A**2)
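

# Note (my addition, not part of the original module): the docstring
# formula averages the per-column variances of the squared loadings,
# while numpy.var(A**2) pools all of the entries into a single variance.
# The two agree whenever the columns of A**2 share a common mean (e.g.
# unit-norm factor columns).  This explicit per-column version is an
# illustrative cross-check; the helper name is hypothetical.
def _varimax_criterion_per_column(A):
    k = A.shape[1]
    return sum(_numpy.var(A[:,j]**2) for j in range(k)) / k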


def pca(scores, ddof=0, output_dim=None):
    """Principal component analysis via singular value decomposition

    Return an array of eigenvalues (variance scaling) and an
    orthogonal matrix of unitary eigenvectors (the factor loadings).
    """
    X = scores
    d = len(X)
    X = (X - X.mean(axis=0)) / X.std(axis=0, ddof=0) / _numpy.sqrt(d-ddof)
    U,s,V = _numpy.linalg.svd(X, full_matrices=False)
    G = U.dot(_numpy.diag(s))  # the factor scores (unused here)
    ss = _numpy.sort(s)[::-1]
    assert (s == ss).all(), (s, ss, s-ss)  # SVD sorts by decreasing s
    if output_dim is None:
        output_dim = V.T.shape[1]
    eigenvalues = s**2  # see explained_variance.__doc__
    return (eigenvalues, V.T[:,:output_dim])
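

# Illustrative cross-check (my addition, not part of the original
# module): with ddof=0 the normalization in pca() makes X.T.dot(X) the
# population correlation matrix of the scores, so the squared singular
# values returned as eigenvalues should match numpy.linalg.eigh on that
# matrix, and the loadings should match its eigenvectors up to column
# sign and ordering.  The helper name is hypothetical.
def _pca_via_correlation(scores):
    Z = (scores - scores.mean(axis=0)) / scores.std(axis=0)
    corr = Z.T.dot(Z) / len(scores)  # population correlation matrix
    evals,evecs = _numpy.linalg.eigh(corr)
    order = evals.argsort()[::-1]  # eigh returns ascending eigenvalues
    return (evals[order], evecs[:,order])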


def communalities(A):
    """Communalities for each measured attribute

    This is how well the model fits each attribute.  It's similar to
    the coefficient of determination (R²) for linear regressions.

    http://sites.stat.psu.edu/~ajw13/stat505/fa06/17_factor/07_factor_commun.html
    """
    return (A**2).sum(axis=1).T
468 """Extract the measured attribute variance from the loading matrix
470 Some loading matricies have the measured attribute variance still
471 built into the loading terms (like covariance), while some have
472 the variance scaled out of them (i.e. unit eigenvectors, like
473 correlations). If the variance is built in, this function find
476 return (A**2).sum(axis=0)


def sort_eigenvalues(A):
    """Sort non-unitary eigenvalues in decreasing magnitude"""
    evals = eigenvalues(A)
    B = _numpy.empty(A.shape, dtype=A.dtype)
    for i,j in enumerate(reversed(evals.argsort())):
        B[:,i] = A[:,j]
    return B


def reduce_double_negatives(A):
    """Flip eigenvalue sign to make largest component positive"""
    for i in range(A.shape[1]):
        if abs(A[:,i].min()) > A[:,i].max():
            A[:,i] *= -1
    return A


def explained_variance(eigenvalues, total=None):
    """Fraction of explained variance for each measured attribute

    The common factors have unit variance, so the (squared)
    eigenvalues of the factors give the variance of the modeled
    attributes.  "Squared" is in parentheses, because you need to
    square the eigenvalues calculated using the score SVD (converting
    standard deviations to variance), but you don't need to square
    eigenvalues calculated from the covariance matrix.
    """
    if total is None:
        total = eigenvalues.sum()
    return eigenvalues / total


def varimax(A, max_iterations=None, epsilon=1e-7):
    """Return the varimax-rotated version of a factor-loading matrix A

    For each pair of columns (u, v), pack the loadings into a complex
    vector and calculate

      z = 1/d ∑ⱼ (Aⱼᵤ + iAⱼᵥ)⁴ - (1/d ∑ⱼ (Aⱼᵤ + iAⱼᵥ)²)²

    after which the rotation angle for that plane is phi = arg(z)/4
    (see Park's implementation notes [2]).
    """
    d,k = A.shape
    od = 1.0/d
    if k == 1:  # only one common factor, so there's nothing to rotate
        return A
    if max_iterations is None:
        if k == 2:  # only one combination, so repetition is redundant
            max_iterations = 1
        else:
            max_iterations = 100  # arbitrary cap
    ovc = varimax_criterion(A)
    for i in range(max_iterations):
        for p,q in _itertools.combinations(range(k), 2):
            Aj = _numpy.array([complex(Ajp, Ajq)
                               for Ajp,Ajq in zip(A[:,p], A[:,q])])
            z = od * (Aj**4).sum() - (od * (Aj**2).sum())**2
            phi = 0.25 * _numpy.angle(z)
            R = _numpy.identity(k)
            R[p,p] = R[q,q] = _numpy.cos(phi)
            R[p,q] = -_numpy.sin(phi)
            R[q,p] = _numpy.sin(phi)
            A = A.dot(R)
        vc = varimax_criterion(A)
        if (vc - ovc) < epsilon * vc:
            return A  # converged :)
        ovc = vc
    # didn't converge (if k != 2) :(
    return A
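

# Brute-force sanity check (my addition, not part of the original
# module): scan candidate angles in a two-factor plane and return the
# angle whose rotation gives the highest varimax criterion.  For a
# two-column A this provides an independent estimate to compare against
# the closed-form phi used in varimax() above.  The helper name and the
# step count are hypothetical.
def _best_angle_by_scan(A, steps=3600):
    best_phi = 0.0
    best_vc = varimax_criterion(A)
    for phi in _numpy.linspace(0.0, 2 * _numpy.pi, steps, endpoint=False):
        c = _numpy.cos(phi)
        s = _numpy.sin(phi)
        R = _numpy.array([[c, -s], [s, c]])  # planar rotation by phi
        vc = varimax_criterion(A.dot(R))
        if vc > best_vc:
            best_phi = phi
            best_vc = vc
    return best_phi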


def analyze(scores, output_dim=3, unitary=False, delimiter='\t', missing=''):
    masked_scores = _numpy.ma.array(scores, mask=_numpy.isnan(scores))
    mu = masked_scores.mean(axis=0)
    std = masked_scores.std(axis=0, ddof=output_dim)
    masked_scores = (masked_scores - mu).filled(0)
    evals,A = pca(masked_scores, output_dim=output_dim)
    if not unitary:
        A *= _numpy.sqrt(evals[:output_dim])
    A = varimax(A)
    A = sort_eigenvalues(A)
    A = reduce_double_negatives(A)
    print('#' + delimiter.join(
            ['eig', 'prop', 'r-eig', 'r-prop', 'com'
             ] + [
             'f-{}'.format(i) for i in range(output_dim)
             ] + [
             'mean', 'std']))
    r_evals = eigenvalues(A)
    explained = explained_variance(evals)
    r_explained = explained_variance(r_evals, total=evals.sum())
    com = communalities(A)
    for i,a in enumerate(A):
        r_eval = r_exp = missing
        if i < len(r_evals):
            r_eval = r_evals[i]
            r_exp = r_explained[i]
        row = [evals[i], explained[i], r_eval, r_exp, com[i]]
        row.extend(a)
        row.extend([mu[i], std[i]])
        print_row(row, delimiter=delimiter, precision=3)
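

# Sketch (my addition, not part of the original module) of the pipeline
# analyze() uses, but returning the rotated, sorted loadings instead of
# printing a table, and without analyze()'s NaN masking.  The helper
# name is hypothetical.
def _rotated_loadings(scores, output_dim=3):
    evals,A = pca(scores, output_dim=output_dim)
    A *= _numpy.sqrt(evals[:output_dim])  # scale the variance back in
    A = varimax(A)
    return (evals, reduce_double_negatives(sort_eigenvalues(A)))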


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-n', dest='output_dim', type=int, default=1,
        help='number of common factors to use')
    parser.add_argument(
        '-u', dest='unitary', default=False, action='store_const', const=True,
        help='use unitary eigenvalues for factor loading')
    parser.add_argument(
        'path',
        help='path to the tab-delimited data file to analyze')

    args = parser.parse_args()

    scores = _numpy.genfromtxt(fname=args.path, delimiter='\t')
    analyze(scores=scores, output_dim=args.output_dim, unitary=args.unitary)
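
# Example invocation (illustrative; the module filename and the data
# file 'scores.tsv' are hypothetical):
#
#   python factor_analysis.py -n 3 scores.tsv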