
PCA algorithm flowchart

import numpy as np
from sklearn.datasets import load_iris
iris = load_iris().data.copy()
print('iris: \n', iris[:10,:])
iris: 
 [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]]

1. Mean-center the data set.

m = iris.mean(axis=0)
ir = iris - m
print('m: ',m)
print('ir: \n', ir[:10, :])
m:  [5.84333333 3.05733333 3.758      1.19933333]
ir: 
 [[-0.74333333  0.44266667 -2.358      -0.99933333]
 [-0.94333333 -0.05733333 -2.358      -0.99933333]
 [-1.14333333  0.14266667 -2.458      -0.99933333]
 [-1.24333333  0.04266667 -2.258      -0.99933333]
 [-0.84333333  0.54266667 -2.358      -0.99933333]
 [-0.44333333  0.84266667 -2.058      -0.79933333]
 [-1.24333333  0.34266667 -2.358      -0.89933333]
 [-0.84333333  0.34266667 -2.258      -0.99933333]
 [-1.44333333 -0.15733333 -2.358      -0.99933333]
 [-0.94333333  0.04266667 -2.258      -1.09933333]]
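
A quick sanity check on the centering (a minimal sketch on top of the code above): the column means of ir should now be numerically zero.

# After mean centering, every column mean should be ~0 up to float error.
print(np.allclose(ir.mean(axis=0), 0))  # True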

2. Compute the covariance matrix (sigma) of the mean-centered data.

cv = np.cov(ir, rowvar=False)
print("cv: \n", cv)
cv: 
 [[ 0.68569351 -0.042434    1.27431544  0.51627069]
 [-0.042434    0.18997942 -0.32965638 -0.12163937]
 [ 1.27431544 -0.32965638  3.11627785  1.2956094 ]
 [ 0.51627069 -0.12163937  1.2956094   0.58100626]]
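
Since ir is already mean-centered, np.cov (which uses the unbiased n-1 normalization) reduces to a plain matrix product; a minimal sketch to verify:

# np.cov with rowvar=False treats columns as variables and divides by n-1;
# on centered data this is just ir.T @ ir / (n - 1).
n = ir.shape[0]
print(np.allclose(cv, ir.T @ ir / (n - 1)))  # True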

3. Compute the eigenvalues and eigenvectors of sigma.

val, vec = np.linalg.eig(cv)
val = np.abs(val)  # guard against tiny negative round-off in the eigenvalues
print('val: \n', val)
val: 
 [4.22824171 0.24267075 0.0782095  0.02383509]
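
The abs() above only guards against tiny negative round-off. Because a covariance matrix is symmetric, np.linalg.eigh is the more natural tool: it guarantees real eigenvalues and returns them in ascending order. A minimal alternative sketch:

# eigh is specialized for symmetric matrices: real eigenvalues, ascending order.
val_h, vec_h = np.linalg.eigh(cv)
print(val_h[::-1])  # descending: matches the values above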

4. Sort the eigenvalues in descending order of absolute value (and compute each one's explained-variance fraction).

idx = np.argsort(val)[::-1]
ex = val[idx] / val.sum()
print("fraction explained: \n", ex)
fraction explained: 
 [0.92461872 0.05306648 0.01710261 0.00521218]
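
These fractions can drive the choice of how many components to keep; a minimal sketch using a cumulative-variance threshold (0.95 here is an arbitrary example value):

# Keep the smallest k whose cumulative explained variance clears the threshold.
cum = np.cumsum(ex)
print(cum)                           # [0.9246 0.9777 0.9948 1.    ]
k = int(np.argmax(cum >= 0.95)) + 1
print(k)                             # 2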

5. Discard the ‘weakest’ (smallest) eigenvalues and their eigenvectors, keeping only the strongest ones (this step is optional).

w = np.vstack((vec[:,idx[0]], vec[:,idx[1]]))  # the two strongest eigenvectors, stacked as rows (2x4)
print("w: \n", w)
w: 
 [[ 0.36138659 -0.08452251  0.85667061  0.3582892 ]
 [-0.65658877 -0.73016143  0.17337266  0.07548102]]

6. Project the data onto the transformation matrix W built from the remaining eigenvectors.

# Map every mean-centered sample onto the two principal axes
d = np.zeros((ir.shape[0], 2))
for i in range(ir.shape[0]):
    d[i, :] = np.dot(w, ir[i])
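
The loop is fine for illustration, but the whole projection is a single matrix product; a minimal equivalent sketch:

# Each row d[i] equals w @ ir[i], so the full projection is ir @ w.T.
d_vec = ir @ w.T              # shape (150, 2)
print(np.allclose(d, d_vec))  # True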


Example: PCA dimensionality reduction implemented on the peopleInfo data

import pandas as pd

1. Example data

# Eating/exercise habits and the resulting body shape
df = pd.DataFrame(columns=['calory', 'breakfast', 'lunch', 'dinner', 'exercise', 'body_shape'])
df.loc[0] = [1200, 1, 0, 0, 2, 'Skinny']
df.loc[1] = [2800, 1, 1, 1, 1, 'Normal']
df.loc[2] = [3500, 2, 2, 1, 0, 'Fat']
df.loc[3] = [1400, 0, 1, 0, 3, 'Skinny']
df.loc[4] = [5000, 2, 2, 2, 0, 'Fat']
df.loc[5] = [1300, 0, 0, 1, 2, 'Skinny']
df.loc[6] = [3000, 1, 0, 1, 1, 'Normal']
df.loc[7] = [4000, 2, 2, 2, 0, 'Fat']
df.loc[8] = [2600, 0, 2, 0, 0, 'Normal']
df.loc[9] = [3000, 1, 2, 1, 1, 'Fat']
df.head(10)
   calory breakfast lunch dinner exercise body_shape
0    1200         1     0      0        2     Skinny
1    2800         1     1      1        1     Normal
2    3500         2     2      1        0        Fat
3    1400         0     1      0        3     Skinny
4    5000         2     2      2        0        Fat
5    1300         0     0      1        2     Skinny
6    3000         1     0      1        1     Normal
7    4000         2     2      2        0        Fat
8    2600         0     2      0        0     Normal
9    3000         1     2      1        1        Fat

2. Split into feature vectors and labels

# X holds the feature vectors
X = df[['calory', 'breakfast', 'lunch', 'dinner', 'exercise']]
X.head(9)
   calory breakfast lunch dinner exercise
0    1200         1     0      0        2
1    2800         1     1      1        1
2    3500         2     2      1        0
3    1400         0     1      0        3
4    5000         2     2      2        0
5    1300         0     0      1        2
6    3000         1     0      1        1
7    4000         2     2      2        0
8    2600         0     2      0        0
# Y holds the labels
Y = df[['body_shape']]
Y.head(10)
  body_shape
0     Skinny
1     Normal
2        Fat
3     Skinny
4        Fat
5     Skinny
6     Normal
7        Fat
8     Normal
9        Fat

3. Rescale all features to a common scale (standardization)

from sklearn.preprocessing import StandardScaler
x_std = StandardScaler().fit_transform(X)
x_std
array([[-1.35205803,  0.        , -1.3764944 , -1.28571429,  1.        ],
       [ 0.01711466,  0.        , -0.22941573,  0.14285714,  0.        ],
       [ 0.61612771,  1.29099445,  0.91766294,  0.14285714, -1.        ],
       [-1.18091145, -1.29099445, -0.22941573, -1.28571429,  2.        ],
       [ 1.89972711,  1.29099445,  0.91766294,  1.57142857, -1.        ],
       [-1.26648474, -1.29099445, -1.3764944 ,  0.14285714,  1.        ],
       [ 0.18826125,  0.        , -1.3764944 ,  0.14285714,  0.        ],
       [ 1.04399418,  1.29099445,  0.91766294,  1.57142857, -1.        ],
       [-0.15403193, -1.29099445,  0.91766294, -1.28571429, -1.        ],
       [ 0.18826125,  0.        ,  0.91766294,  0.14285714,  0.        ]])
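
StandardScaler is just the z-score with the population standard deviation (ddof=0); a minimal sketch reproducing it by hand:

# z = (x - mean) / std, with numpy's default population std (ddof=0),
# which is also what StandardScaler uses.
Xv = np.asarray(X, dtype=float)
x_manual = (Xv - Xv.mean(axis=0)) / Xv.std(axis=0)
print(np.allclose(x_std, x_manual))  # True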

4. Covariance matrix of the features

import numpy as np
# Transpose so each row of `features` holds one feature;
# np.cov treats rows as variables by default
features = x_std.T
covariance_matrix = np.cov(features)
print(covariance_matrix)
[[ 1.11111111  0.88379717  0.76782385  0.89376551 -0.93179808]
 [ 0.88379717  1.11111111  0.49362406  0.81967902 -0.71721914]
 [ 0.76782385  0.49362406  1.11111111  0.40056715 -0.76471911]
 [ 0.89376551  0.81967902  0.40056715  1.11111111 -0.63492063]
 [-0.93179808 -0.71721914 -0.76471911 -0.63492063  1.11111111]]
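
Note the diagonal entries are 10/9 ≈ 1.1111 rather than exactly 1: StandardScaler divides by the population std (n) while np.cov uses the unbiased n-1 normalization. A one-line check:

# Variance of standardized columns under the n-1 convention is n / (n - 1).
n = x_std.shape[0]
print(n / (n - 1), np.allclose(np.diag(covariance_matrix), n / (n - 1)))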

5. Eigenvectors and eigenvalues of the covariance matrix

eig_vals, eig_vecs = np.linalg.eig(covariance_matrix)
print('Eigenvectors \n',eig_vecs)
Eigenvectors 
 [[-0.508005   -0.0169937  -0.84711404  0.11637853  0.10244985]
 [-0.44660335 -0.36890361  0.12808055 -0.63112016 -0.49973822]
 [-0.38377913  0.70804084  0.20681005 -0.40305226  0.38232213]
 [-0.42845209 -0.53194699  0.3694462   0.22228235  0.58954327]
 [ 0.46002038 -0.2816592  -0.29450345 -0.61341895  0.49601841]]
print('Eigenvalues \n', eig_vals)
Eigenvalues 
 [4.0657343  0.8387565  0.07629538 0.27758568 0.2971837 ]
# Reduce to 1 dimension: the first eigenvector alone explains ~73% of the variance (enough here)
eig_vals[0] / sum(eig_vals)
0.7318321731427542
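
Note that np.linalg.eig returns eigenvalues in no particular order (the third value above is actually the smallest), so indexing position 0 works here only because the largest happens to come first. A minimal sketch that sorts explicitly before selecting:

# Sort eigenpairs by descending eigenvalue before picking components.
order = np.argsort(eig_vals)[::-1]
eig_vals_sorted = eig_vals[order]
eig_vecs_sorted = eig_vecs[:, order]
print(eig_vals_sorted / eig_vals_sorted.sum())
# [0.7318 0.1510 0.0535 0.0500 0.0137]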

6. Project the data points onto the selected eigenvector

projected_X = x_std.dot(eig_vecs.T[0])
projected_X
array([ 2.22600943,  0.0181432 , -1.76296611,  2.73542407, -3.02711544,
        2.14702579,  0.37142473, -2.59239883,  0.39347815, -0.50902498])
result = pd.DataFrame(projected_X, columns=['PC1'])
result['y-axis'] = 0.0
result['label'] = Y
result.head(10)
        PC1  y-axis   label
0  2.226009     0.0  Skinny
1  0.018143     0.0  Normal
2 -1.762966     0.0     Fat
3  2.735424     0.0  Skinny
4 -3.027115     0.0     Fat
5  2.147026     0.0  Skinny
6  0.371425     0.0  Normal
7 -2.592399     0.0     Fat
8  0.393478     0.0  Normal
9 -0.509025     0.0     Fat
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.lmplot(x='PC1', y='y-axis', data=result, fit_reg=False,  # x-axis, y-axis, data, no line
           scatter_kws={"s": 50}, # marker size
           hue="label") # color

# title
plt.title('PCA result')
Text(0.5, 1.0, 'PCA result')

(Figure: the 1-D PCA projection scattered along PC1, colored by body-shape label)

※ Extra: PCA with scikit-learn

from sklearn import decomposition
pca = decomposition.PCA(n_components=1)
sklearn_pca_x = pca.fit_transform(x_std)
sklearn_result = pd.DataFrame(sklearn_pca_x, columns=['PC1'])
sklearn_result['y-axis'] = 0.0
sklearn_result['label'] = Y

sns.lmplot(x='PC1', y='y-axis', data=sklearn_result, fit_reg=False,  # x-axis, y-axis, data, no line
           scatter_kws={"s": 50}, # marker size
           hue="label") # color
<seaborn.axisgrid.FacetGrid at 0x14de7824f90>

(Figure: the scikit-learn PCA projection, colored by body-shape label)
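
As a cross-check (a minimal sketch), scikit-learn should report the same ~73% explained-variance ratio, and its projection should match the manual one up to sign, since an eigenvector's sign is arbitrary:

# Same explained variance as the manual eigendecomposition (~0.7318);
# the projections may differ by an overall sign flip.
print(pca.explained_variance_ratio_)
print(np.allclose(np.abs(sklearn_pca_x[:, 0]), np.abs(projected_X)))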

Source: https://github.com/minsuk-heo/python_tutorial/blob/master/data_science/pca/PCA.ipynb



Dimensionality reduction with SVD

import numpy as np

1. Create the original matrix

A = np.array([[1,2,3],[4,5,6],[7,8,9]])

2. Perform the SVD

U, S, Vt = np.linalg.svd(A)

3. Build the diagonal matrix of singular values

Sigma = np.diag(S)

4. Print the SVD results

print("Original Martrix A: \n")
print(A)
Original Matrix A: 

[[1 2 3]
 [4 5 6]
 [7 8 9]]
print("SVD Results: \n")
print(U)
U matrix: 

[[-0.21483724  0.88723069  0.40824829]
 [-0.52058739  0.24964395 -0.81649658]
 [-0.82633754 -0.38794278  0.40824829]]
print("Sigma matrix: \n")
print(Sigma)
Sigma matrix: 

[[1.68481034e+01 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 1.06836951e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 4.41842475e-16]]
print("Vt matrix: \n")
print(Vt)
Vt matrix: 

[[-0.47967118 -0.57236779 -0.66506441]
 [-0.77669099 -0.07568647  0.62531805]
 [-0.40824829  0.81649658 -0.40824829]]
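
A quick sanity check (a minimal sketch) that the factorization is exact:

# U @ Sigma @ Vt should reproduce A up to floating-point round-off.
print(np.allclose(U @ Sigma @ Vt, A))  # True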

5. Matrix approximation: approximate A using only the top k singular values

k = 2
Ak = U[:, :k] @ Sigma[:k, :k] @ Vt[:k, :]

6. Print the matrix-approximation result

print("Matrix Approximation with top", k, "singular values: \n")
print(Ak)
Matrix Approximation with top 2 singular values: 

[[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]
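
The k=2 approximation reproduces A exactly because the third singular value (~4.4e-16) is numerically zero, i.e. A has rank 2. Truncation error only appears at k=1; a minimal sketch:

# With k=1 the discarded singular values show up as reconstruction error:
# the Frobenius-norm error equals sqrt(s2^2 + s3^2) ~= s2 ~= 1.068 here.
A1 = U[:, :1] @ Sigma[:1, :1] @ Vt[:1, :]
print(A1)
print(np.linalg.norm(A - A1))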


Various array declaration and initialization syntaxes (expected exam material)

(Figure: examples of array declaration and initialization syntax)
