Coding Journal (2024-01-03)
PCA Algorithm Flow
import numpy as np
from sklearn.datasets import load_iris
iris = load_iris().data.copy()
print('iris: \n', iris[:10,:])
iris:
[[5.1 3.5 1.4 0.2]
[4.9 3. 1.4 0.2]
[4.7 3.2 1.3 0.2]
[4.6 3.1 1.5 0.2]
[5. 3.6 1.4 0.2]
[5.4 3.9 1.7 0.4]
[4.6 3.4 1.4 0.3]
[5. 3.4 1.5 0.2]
[4.4 2.9 1.4 0.2]
[4.9 3.1 1.5 0.1]]
1. Mean-center the data set.
m = iris.mean(axis=0)
ir = iris - m
print('m: ',m)
print('ir: \n', ir[:10, :])
m: [5.84333333 3.05733333 3.758 1.19933333]
ir:
[[-0.74333333 0.44266667 -2.358 -0.99933333]
[-0.94333333 -0.05733333 -2.358 -0.99933333]
[-1.14333333 0.14266667 -2.458 -0.99933333]
[-1.24333333 0.04266667 -2.258 -0.99933333]
[-0.84333333 0.54266667 -2.358 -0.99933333]
[-0.44333333 0.84266667 -2.058 -0.79933333]
[-1.24333333 0.34266667 -2.358 -0.89933333]
[-0.84333333 0.34266667 -2.258 -0.99933333]
[-1.44333333 -0.15733333 -2.358 -0.99933333]
[-0.94333333 0.04266667 -2.258 -1.09933333]]
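As a quick sanity check, after centering each column of `ir` should have a (numerically) zero mean; a minimal check using the arrays defined above:

```python
# Each column of the centered data should now average to ~0
print(np.allclose(ir.mean(axis=0), 0))  # expected: True
```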
2. Compute the covariance matrix (Sigma) of the mean-centered data.
cv = np.cov(ir, rowvar=False)
print("cv: \n", cv)
cv:
[[ 0.68569351 -0.042434 1.27431544 0.51627069]
[-0.042434 0.18997942 -0.32965638 -0.12163937]
[ 1.27431544 -0.32965638 3.11627785 1.2956094 ]
[ 0.51627069 -0.12163937 1.2956094 0.58100626]]
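For reference, `np.cov(..., rowvar=False)` with its default normalization is just the sample covariance of the centered columns; a minimal sketch using the `ir` array from step 1:

```python
# Sample covariance of the mean-centered data: Sigma = (X_c^T X_c) / (N - 1)
cv_manual = ir.T @ ir / (ir.shape[0] - 1)
print(np.allclose(cv_manual, cv))  # expected: True
```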
3. Compute the eigenvalues and eigenvectors of Sigma.
val, vec = np.linalg.eig(cv)
val = np.abs(val)
print('val: \n', val)
val:
[4.22824171 0.24267075 0.0782095 0.02383509]
4. Sort the eigenvalues in descending order of absolute value.
idx = np.argsort(val)[::-1]
ex = val[idx] / val.sum()
print("fraction explained: \n", ex)
fraction explained:
[0.92461872 0.05306648 0.01710261 0.00521218]
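Side note: because the covariance matrix is symmetric, `np.linalg.eigh` is a more robust choice than `eig`; it returns real eigenvalues already in ascending order, so only a reversal is needed. A minimal sketch:

```python
# eigh is specialized for symmetric matrices: real eigenvalues, ascending order
val_h, vec_h = np.linalg.eigh(cv)
idx_h = np.argsort(val_h)[::-1]        # descending order of eigenvalue
print(val_h[idx_h] / val_h.sum())      # same explained-variance fractions as above
```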
5. Discard the 'weakest' (smallest) eigenvalues and their eigenvectors (this step is optional).
w = np.vstack((vec[:,idx[0]],vec[:,idx[1]]))
print("w: \n", w)
w:
[[ 0.36138659 -0.08452251 0.85667061 0.3582892 ]
[-0.65658877 -0.73016143 0.17337266 0.07548102]]
6. Build the transformation matrix W from the remaining eigenvectors and project the data onto it.
d = np.zeros((ir.shape[0],2))
for i in range(ir.shape[0]):
    d[i, :] = np.dot(w, ir[i])
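The loop above can also be written as a single matrix product; a minimal equivalent sketch:

```python
# Equivalent vectorized projection: each centered sample times W^T
d_vec = ir @ w.T
print(np.allclose(d, d_vec))  # expected: True
```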
Example: implementing PCA dimensionality reduction on the peopleInfo data
import pandas as pd
1. Example data
# Eating and exercise habits and the corresponding body shape
df = pd.DataFrame(columns=['calory', 'breakfast', 'lunch', 'dinner', 'exercise', 'body_shape'])
df.loc[0] = [1200, 1, 0, 0, 2, 'Skinny']
df.loc[1] = [2800, 1, 1, 1, 1, 'Normal']
df.loc[2] = [3500, 2, 2, 1, 0, 'Fat']
df.loc[3] = [1400, 0, 1, 0, 3, 'Skinny']
df.loc[4] = [5000, 2, 2, 2, 0, 'Fat']
df.loc[5] = [1300, 0, 0, 1, 2, 'Skinny']
df.loc[6] = [3000, 1, 0, 1, 1, 'Normal']
df.loc[7] = [4000, 2, 2, 2, 0, 'Fat']
df.loc[8] = [2600, 0, 2, 0, 0, 'Normal']
df.loc[9] = [3000, 1, 2, 1, 1, 'Fat']
df.head(10)
| | calory | breakfast | lunch | dinner | exercise | body_shape |
|---|---|---|---|---|---|---|
| 0 | 1200 | 1 | 0 | 0 | 2 | Skinny |
| 1 | 2800 | 1 | 1 | 1 | 1 | Normal |
| 2 | 3500 | 2 | 2 | 1 | 0 | Fat |
| 3 | 1400 | 0 | 1 | 0 | 3 | Skinny |
| 4 | 5000 | 2 | 2 | 2 | 0 | Fat |
| 5 | 1300 | 0 | 0 | 1 | 2 | Skinny |
| 6 | 3000 | 1 | 0 | 1 | 1 | Normal |
| 7 | 4000 | 2 | 2 | 2 | 0 | Fat |
| 8 | 2600 | 0 | 2 | 0 | 0 | Normal |
| 9 | 3000 | 1 | 2 | 1 | 1 | Fat |
2. Split into feature vectors and labels
# X is feature vectors
X = df[['calory', 'breakfast', 'lunch', 'dinner', 'exercise']]
X.head(9)
| | calory | breakfast | lunch | dinner | exercise |
|---|---|---|---|---|---|
| 0 | 1200 | 1 | 0 | 0 | 2 |
| 1 | 2800 | 1 | 1 | 1 | 1 |
| 2 | 3500 | 2 | 2 | 1 | 0 |
| 3 | 1400 | 0 | 1 | 0 | 3 |
| 4 | 5000 | 2 | 2 | 2 | 0 |
| 5 | 1300 | 0 | 0 | 1 | 2 |
| 6 | 3000 | 1 | 0 | 1 | 1 |
| 7 | 4000 | 2 | 2 | 2 | 0 |
| 8 | 2600 | 0 | 2 | 0 | 0 |
# Y is labels
Y = df[['body_shape']]
Y.head(10)
| | body_shape |
|---|---|
| 0 | Skinny |
| 1 | Normal |
| 2 | Fat |
| 3 | Skinny |
| 4 | Fat |
| 5 | Skinny |
| 6 | Normal |
| 7 | Fat |
| 8 | Normal |
| 9 | Fat |
3. Rescale the features so they are all on the same scale (standardization)
from sklearn.preprocessing import StandardScaler
x_std = StandardScaler().fit_transform(X)
x_std
array([[-1.35205803, 0. , -1.3764944 , -1.28571429, 1. ],
[ 0.01711466, 0. , -0.22941573, 0.14285714, 0. ],
[ 0.61612771, 1.29099445, 0.91766294, 0.14285714, -1. ],
[-1.18091145, -1.29099445, -0.22941573, -1.28571429, 2. ],
[ 1.89972711, 1.29099445, 0.91766294, 1.57142857, -1. ],
[-1.26648474, -1.29099445, -1.3764944 , 0.14285714, 1. ],
[ 0.18826125, 0. , -1.3764944 , 0.14285714, 0. ],
[ 1.04399418, 1.29099445, 0.91766294, 1.57142857, -1. ],
[-0.15403193, -1.29099445, 0.91766294, -1.28571429, -1. ],
[ 0.18826125, 0. , 0.91766294, 0.14285714, 0. ]])
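For reference, `StandardScaler` standardizes each column using the population standard deviation (`ddof=0`); a minimal sketch reproducing the same result directly in pandas:

```python
import numpy as np

# Manual standardization: (x - mean) / population std, column by column
Xf = X.astype(float)
x_manual = ((Xf - Xf.mean()) / Xf.std(ddof=0)).to_numpy()
print(np.allclose(x_manual, x_std))  # expected: True
```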
4. Covariance matrix of the features
import numpy as np
# features are columns from x_std
features = x_std.T
covariance_matrix = np.cov(features)
print(covariance_matrix)
[[ 1.11111111 0.88379717 0.76782385 0.89376551 -0.93179808]
[ 0.88379717 1.11111111 0.49362406 0.81967902 -0.71721914]
[ 0.76782385 0.49362406 1.11111111 0.40056715 -0.76471911]
[ 0.89376551 0.81967902 0.40056715 1.11111111 -0.63492063]
[-0.93179808 -0.71721914 -0.76471911 -0.63492063 1.11111111]]
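The diagonal entries are 10/9 ≈ 1.11 rather than 1 because `np.cov` divides by N−1 while the scaler divided by N; a quick check of that relationship:

```python
# On standardized columns, np.cov equals the correlation matrix scaled by N/(N-1)
n = x_std.shape[0]
print(np.allclose(covariance_matrix, np.corrcoef(features) * n / (n - 1)))  # expected: True
```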
5. Eigenvectors and eigenvalues of the covariance matrix
eig_vals, eig_vecs = np.linalg.eig(covariance_matrix)
print('Eigenvectors \n',eig_vecs)
Eigenvectors
[[-0.508005 -0.0169937 -0.84711404 0.11637853 0.10244985]
[-0.44660335 -0.36890361 0.12808055 -0.63112016 -0.49973822]
[-0.38377913 0.70804084 0.20681005 -0.40305226 0.38232213]
[-0.42845209 -0.53194699 0.3694462 0.22228235 0.58954327]
[ 0.46002038 -0.2816592 -0.29450345 -0.61341895 0.49601841]]
print('Eigenvalues \n', eig_vals)
Eigenvalues
[4.0657343 0.8387565 0.07629538 0.27758568 0.2971837 ]
# Reduce to 1 dimension, since the first eigenvector alone explains about 73% of the variance (enough here)
eig_vals[0] / sum(eig_vals)
0.7318321731427542
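Note that `np.linalg.eig` does not sort its eigenvalues (here the largest happened to come first). A slightly more defensive sketch sorts the eigenpairs before selecting PC1:

```python
# Sort eigenpairs by eigenvalue, largest first, then pick the leading axis
order = np.argsort(eig_vals)[::-1]
print(eig_vals[order] / eig_vals.sum())  # explained-variance fractions in descending order
pc1 = eig_vecs[:, order[0]]              # first principal axis (same as eig_vecs.T[0] here)
```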
6. Project the data points onto the selected eigenvector
projected_X = x_std.dot(eig_vecs.T[0])
projected_X
array([ 2.22600943, 0.0181432 , -1.76296611, 2.73542407, -3.02711544,
2.14702579, 0.37142473, -2.59239883, 0.39347815, -0.50902498])
result = pd.DataFrame(projected_X, columns=['PC1'])
result['y-axis'] = 0.0
result['label'] = Y
result.head(10)
| | PC1 | y-axis | label |
|---|---|---|---|
| 0 | 2.226009 | 0.0 | Skinny |
| 1 | 0.018143 | 0.0 | Normal |
| 2 | -1.762966 | 0.0 | Fat |
| 3 | 2.735424 | 0.0 | Skinny |
| 4 | -3.027115 | 0.0 | Fat |
| 5 | 2.147026 | 0.0 | Skinny |
| 6 | 0.371425 | 0.0 | Normal |
| 7 | -2.592399 | 0.0 | Fat |
| 8 | 0.393478 | 0.0 | Normal |
| 9 | -0.509025 | 0.0 | Fat |
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.lmplot(x='PC1', y='y-axis', data=result, fit_reg=False,  # x-axis, y-axis, data, no regression line
           scatter_kws={"s": 50},  # marker size
           hue="label")  # color by label
# title
plt.title('PCA result')
Text(0.5, 1.0, 'PCA result')
※ Addendum: scikit-learn PCA
from sklearn import decomposition
pca = decomposition.PCA(n_components=1)
sklearn_pca_x = pca.fit_transform(x_std)
sklearn_result = pd.DataFrame(sklearn_pca_x, columns=['PC1'])
sklearn_result['y-axis'] = 0.0
sklearn_result['label'] = Y
sns.lmplot(x='PC1', y='y-axis', data=sklearn_result, fit_reg=False,  # x-axis, y-axis, data, no regression line
           scatter_kws={"s": 50},  # marker size
           hue="label")  # color by label
<seaborn.axisgrid.FacetGrid at 0x14de7824f90>
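The scikit-learn projection should match the manual one up to an overall sign flip, since the direction of an eigenvector is arbitrary; a quick comparison sketch:

```python
# Manual PC1 and sklearn's PC1 agree up to sign
print(np.allclose(np.abs(sklearn_pca_x.ravel()), np.abs(projected_X)))  # expected: True
```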
Source: https://github.com/minsuk-heo/python_tutorial/blob/master/data_science/pca/PCA.ipynb
Dimensionality reduction with SVD
import numpy as np
1. Create the original matrix
A = np.array([[1,2,3],[4,5,6],[7,8,9]])
2. Perform the SVD
U, S, Vt = np.linalg.svd(A)
3. Build the diagonal matrix of singular values
Sigma = np.diag(S)
4. Print the SVD factors
print("Original Matrix A: \n")
print(A)
Original Matrix A:
[[1 2 3]
[4 5 6]
[7 8 9]]
print("U matrix: \n")
print(U)
U matrix:
[[-0.21483724 0.88723069 0.40824829]
[-0.52058739 0.24964395 -0.81649658]
[-0.82633754 -0.38794278 0.40824829]]
print("Sigma matrix: \n")
print(Sigma)
Sigma matrix:
[[1.68481034e+01 0.00000000e+00 0.00000000e+00]
[0.00000000e+00 1.06836951e+00 0.00000000e+00]
[0.00000000e+00 0.00000000e+00 4.41842475e-16]]
print("Vt matrix: \n")
print(Vt)
Vt matrix:
[[-0.47967118 -0.57236779 -0.66506441]
[-0.77669099 -0.07568647 0.62531805]
[-0.40824829 0.81649658 -0.40824829]]
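A quick sanity check: multiplying the three factors back together reproduces A.

```python
# Full reconstruction: A ≈ U Σ V^T
print(np.allclose(U @ Sigma @ Vt, A))  # expected: True
```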
5. Matrix approximation: compute an approximate matrix using only the top k singular values
k = 2
Ak = U[:, :k] @ Sigma[:k, :k] @ Vt[:k, :]
6. Print the matrix approximation result
print("Matrix Approximation with top", k, "singular values: \n")
print(Ak)
Matrix Approximation with top 2 singular values:
[[1. 2. 3.]
[4. 5. 6.]
[7. 8. 9.]]
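The rank-2 approximation reproduces A exactly because the third singular value (≈ 4.4e-16) is numerically zero, i.e. A itself has rank 2. With k = 1 the approximation becomes genuinely lossy; a short sketch:

```python
# Keep only the dominant singular direction (k = 1)
k1 = 1
A1 = U[:, :k1] @ Sigma[:k1, :k1] @ Vt[:k1, :]
print(np.round(A1, 2))
print("Frobenius error:", np.linalg.norm(A - A1))  # ≈ 1.068, the discarded singular value
```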