Exercise 10.8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline
# Import dataset: USArrests, indexed by state name (first CSV column).
df = pd.read_csv('../data/USArrests.csv', index_col=0)
# Data overview: first five rows (Murder, Assault, UrbanPop, Rape per state)
df.head()
Murder | Assault | UrbanPop | Rape | |
---|---|---|---|---|
Alabama | 13.2 | 236 | 58 | 21.2 |
Alaska | 10.0 | 263 | 48 | 44.5 |
Arizona | 8.1 | 294 | 80 | 31.0 |
Arkansas | 8.8 | 190 | 50 | 19.5 |
California | 9.0 | 276 | 91 | 40.6 |
# Standardize every column to zero mean and unit variance.
# Fitting and transforming as two explicit steps (equivalent to fit_transform).
scl = StandardScaler().fit(df)
df_scl = scl.transform(df)
Note: We should standardize the data because scale matters in this exercise. The variance of the variable Assault is much larger than the variance of the remaining variables, so if we performed PCA on the unscaled data, the first principal component loading vector would place a very large loading on Assault. This would lead to a misleading solution.
(a)
# Fit a full PCA (all components) on the standardized data.
# PCA.fit returns the estimator itself, so construction and fitting chain.
pca = PCA().fit(df_scl)
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)
# Proportion of variance explained (PVE) by each principal component.
# Comparable to R's prcomp(): there, PVE = sdev^2 / sum(sdev^2), since
# sdev holds the component standard deviations rather than the PVE itself.
pca.explained_variance_ratio_
array([ 0.62006039, 0.24744129, 0.0891408 , 0.04335752])
(b)
# Loadings of the principal components.
# Each ROW of components_ is one loading vector phi_k (see References);
# columns correspond to the original variables in df's column order.
pca.components_
array([[ 0.53589947, 0.58318363, 0.27819087, 0.54343209],
[ 0.41818087, 0.1879856 , -0.87280619, -0.16731864],
[-0.34123273, -0.26814843, -0.37801579, 0.81777791],
[ 0.6492278 , -0.74340748, 0.13387773, 0.08902432]])
# Centered and scaled variables.
# In (a) we used centered and scaled variables, so we must use the same
# data matrix here for Equation 10.8 to reproduce explained_variance_ratio_.
df_scl
array([[ 1.25517927, 0.79078716, -0.52619514, -0.00345116],
[ 0.51301858, 1.11805959, -1.22406668, 2.50942392],
[ 0.07236067, 1.49381682, 1.00912225, 1.05346626],
[ 0.23470832, 0.23321191, -1.08449238, -0.18679398],
[ 0.28109336, 1.2756352 , 1.77678094, 2.08881393],
[ 0.02597562, 0.40290872, 0.86954794, 1.88390137],
[-1.04088037, -0.73648418, 0.79976079, -1.09272319],
[-0.43787481, 0.81502956, 0.45082502, -0.58583422],
[ 1.76541475, 1.99078607, 1.00912225, 1.1505301 ],
[ 2.22926518, 0.48775713, -0.38662083, 0.49265293],
[-0.57702994, -1.51224105, 1.21848371, -0.11129987],
[-1.20322802, -0.61527217, -0.80534376, -0.75839217],
[ 0.60578867, 0.94836277, 1.21848371, 0.29852525],
[-0.13637203, -0.70012057, -0.03768506, -0.0250209 ],
[-1.29599811, -1.39102904, -0.5959823 , -1.07115345],
[-0.41468229, -0.67587817, 0.03210209, -0.34856705],
[ 0.44344101, -0.74860538, -0.94491807, -0.53190987],
[ 1.76541475, 0.94836277, 0.03210209, 0.10439756],
[-1.31919063, -1.06375661, -1.01470522, -1.44862395],
[ 0.81452136, 1.56654403, 0.10188925, 0.70835037],
[-0.78576263, -0.26375734, 1.35805802, -0.53190987],
[ 1.00006153, 1.02108998, 0.59039932, 1.49564599],
[-1.1800355 , -1.19708982, 0.03210209, -0.68289807],
[ 1.9277624 , 1.06957478, -1.5032153 , -0.44563089],
[ 0.28109336, 0.0877575 , 0.31125071, 0.75148985],
[-0.41468229, -0.74860538, -0.87513091, -0.521125 ],
[-0.80895515, -0.83345379, -0.24704653, -0.51034012],
[ 1.02325405, 0.98472638, 1.0789094 , 2.671197 ],
[-1.31919063, -1.37890783, -0.66576945, -1.26528114],
[-0.08998698, -0.14254532, 1.63720664, -0.26228808],
[ 0.83771388, 1.38472601, 0.31125071, 1.17209984],
[ 0.76813632, 1.00896878, 1.42784517, 0.52500755],
[ 1.20879423, 2.01502847, -1.43342815, -0.55347961],
[-1.62069341, -1.52436225, -1.5032153 , -1.50254831],
[-0.11317951, -0.61527217, 0.66018648, 0.01811858],
[-0.27552716, -0.23951493, 0.1716764 , -0.13286962],
[-0.66980002, -0.14254532, 0.10188925, 0.87012344],
[-0.34510472, -0.78496898, 0.45082502, -0.68289807],
[-1.01768785, 0.03927269, 1.49763233, -1.39469959],
[ 1.53348953, 1.3119988 , -1.22406668, 0.13675217],
[-0.92491776, -1.027393 , -1.43342815, -0.90938037],
[ 1.25517927, 0.20896951, -0.45640799, 0.61128652],
[ 1.13921666, 0.36654512, 1.00912225, 0.46029832],
[-1.06407289, -0.61527217, 1.00912225, 0.17989166],
[-1.29599811, -1.48799864, -2.34066115, -1.08193832],
[ 0.16513075, -0.17890893, -0.17725937, -0.05737552],
[-0.87853272, -0.31224214, 0.52061217, 0.53579242],
[-0.48425985, -1.08799901, -1.85215107, -1.28685088],
[-1.20322802, -1.42739264, 0.03210209, -1.1250778 ],
[-0.22914211, -0.11830292, -0.38662083, -0.60740397]])
# Application of Equation 10.8: proportion of variance explained (PVE),
# computed directly from the loading vectors and the (scaled) data.
def pve_from_loadings(X, loadings):
    """Return the PVE of each principal component via Equation 10.8.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Centered (here also scaled) data matrix.
    loadings : ndarray, shape (n_components, n_features)
        Loading vectors, one per row (layout of sklearn's ``pca.components_``).

    Returns
    -------
    ndarray, shape (n_components,)
        PVE of each component; sums to 1 when n_components == n_features.
    """
    # Scores z_ik = sum_j phi_jk * x_ij; numerator of 10.8 is sum_i z_ik^2.
    scores = X @ loadings.T
    num = np.sum(scores ** 2, axis=0)
    # Denominator sum_j sum_i x_ij^2 does not depend on the component k,
    # so compute it once (the original loop recomputed it for every k).
    den = np.sum(X ** 2)
    return num / den

# NOTE: iterate over the number of components (ROWS of pca.components_).
# The original looped over shape[1] (feature count), which only worked
# because the loading matrix happens to be square here.
for k, ratio in enumerate(pve_from_loadings(df_scl, pca.components_), start=1):
    print('principal component number:', k)
    print(ratio)
principal component number: 1
0.620060394787
principal component number: 2
0.247441288135
principal component number: 3
0.0891407951452
principal component number: 4
0.0433575219325
References
- http://stackoverflow.com/questions/21217710/factor-loadings-using-sklearn (loading vectors)