BACK END/Deep Learning
[딥러닝] PCA
circle kim
2021. 3. 16. 16:28
특성공학중 PCA(Principal Component Analysis)
: 특성을 단순히 선택하는 것이 아니라 특성들의 조합으로 새로운 특성을 생성
: PCA(주성분 분석)는 특성 추출(Feature Extraction) 기법에 속함
iris dataset으로 차원 축소 (4개의 열을 2개의 주성분으로 축소하며, 예제에서는 새 열 이름을 sepal, petal로 지정)
* pca_test.py
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.datasets import load_iris
# Select a font by name so the Korean plot labels can render
# (assumes a font called 'malgun gothic' is installed on this machine).
plt.rc('font', family='malgun gothic')

iris = load_iris()
n = 10

# Take the first n samples, then keep only the first two columns
# (the sepal measurements) to inspect the pattern.
first_rows = iris.data[:n]
x = first_rows[:, :2]
print('차원 축소 전 x:\n', x, x.shape, type(x))  # (10, 2) <class 'numpy.ndarray'>
'''
[[5.1 3.5]
[4.9 3. ]
[4.7 3.2]
[4.6 3.1]
[5. 3.6]
[5.4 3.9]
[4.6 3.4]
[5. 3.4]
[4.4 2.9]
[4.9 3.1]]
'''

# Transposed view: one row per feature, one column per sample.
print(x.transpose())
# [[5.1 4.9 4.7 4.6 5. 5.4 4.6 5. 4.4 4.9]
# [3.5 3. 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1]]
from sklearn.datasets import load_iris
load_iris() : iris dataset을 로드하는 함수. 특성 데이터(iris.data)는 ndarray 타입이다.
# Visualization 1: x.T has one column per sample, so each sample is drawn
# as its own dotted line running from feature 0 to feature 1.
sample_labels = ['표본{}'.format(k) for k in range(1, n + 1)]

plt.plot(x.T, 'o:')
plt.xticks([0, 1], labels=['꽃받침 길이', '꽃받침 폭'])
plt.xlim(-0.5, 2)
plt.ylim(2.5, 6)
plt.title('iris 특성')
plt.legend(sample_labels)
plt.show()
# Visualization 2: scatter plot of the two sepal features, with every point
# annotated by its 1-based sample number.
plt.figure(figsize=(8, 8))
df = pd.DataFrame(x)
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the old positional form raises a TypeError there.
ax = sns.scatterplot(x=df[0], y=df[1], data=df, marker='s', s=100, color=".2")
for i in range(n):
    # Shift the label slightly left of and above its marker.
    ax.text(x[i, 0] - 0.05, x[i, 1] + 0.03, '표본{}'.format(i + 1))
plt.xlabel('꽃받침 길이')
plt.ylabel('꽃받침 폭')
plt.title('iris 특성')
plt.show()
# PCA: reduce the two sepal features to a single principal component.
pca1 = PCA(n_components=1)

# fit_transform learns the component from x (no labels are passed, i.e.
# unsupervised) and returns the 1-D approximation of each sample.
x_row = pca1.fit_transform(x)
print('x_row :\n', x_row, x_row.shape)  # (10, 1)
'''
[[ 0.30270263]
[-0.1990931 ]
[-0.18962889]
[-0.33097106]
[ 0.30743473]
[ 0.79976625]
[-0.11185966]
[ 0.16136046]
[-0.61365539]
[-0.12605597]]
'''

# Map the 1-D values back into the original 2-D feature space.
x2 = pca1.inverse_transform(x_row)
print('복귀 후 값:\n', x2, x2.shape)  # (10, 2)
'''
[[5.06676112 3.53108532]
[4.7240094 3.1645881 ]
[4.73047393 3.17150049]
[4.63393012 3.06826822]
[5.06999338 3.53454152]
[5.40628057 3.89412635]
[4.78359423 3.22830091]
[4.97021731 3.42785306]
[4.44084251 2.86180369]
[4.77389743 3.21793233]]
'''

# First sample: its reduced value and its reconstruction.
print(x_row[0])  # [0.30270263]
print(x2[0])  # [5.06676112 3.53108532]
# Visualization 2 (reused): scatter plot of the original samples together
# with their reconstructions x2 from the 1-component PCA.
df = pd.DataFrame(x)
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the old positional form raises a TypeError there.
ax = sns.scatterplot(x=df[0], y=df[1], data=df, marker='s', s=100, color=".2")
for i in range(n):
    # Place the label on the side facing away from the reconstructed point
    # so it does not sit on the dashed connector. The offset `d` was
    # previously computed but never applied.
    d = 0.03 if x[i, 1] > x2[i, 1] else -0.04
    ax.text(x[i, 0] - 0.05, x[i, 1] + d, '표본{}'.format(i + 1))
    # Dashed connector from the original sample to its reconstruction.
    plt.plot([x[i, 0], x2[i, 0]], [x[i, 1], x2[i, 1]], "k--")
# Reconstructed points, joined by a line.
plt.plot(x2[:, 0], x2[:, 1], "o-", markersize=10, color="b")
# Diamond marker at the mean of the original samples.
plt.plot(x[:, 0].mean(), x[:, 1].mean(), markersize=10, marker="D")
plt.axvline(x[:, 0].mean(), c='r')  # vertical line at the mean of feature 0
plt.axhline(x[:, 1].mean(), c='r')  # horizontal line at the mean of feature 1
plt.xlabel('꽃받침 길이')
plt.ylabel('꽃받침 폭')
plt.title('iris 특성')
plt.show()
# Repeat the reduction on the full feature matrix: 4 columns -> 2 components.
x = iris.data
pca2 = PCA(n_components=2)
x_row2 = pca2.fit_transform(x)
print('x_row2 :\n', x_row2, x_row2.shape)

# Reconstruct the 4-column data from the 2 components for comparison.
x4 = pca2.inverse_transform(x_row2)
print('최초자료 :', x[0])  # original data : [5.1 3.5 1.4 0.2]
print('차원축소 :', x_row2[0])  # reduced      : [-2.68412563 0.31939725]
print('최초복귀 :', x4[0])  # reconstructed: [5.08303897 3.51741393 1.40321372 0.21353169]
print()

# Wrap both versions in DataFrames for side-by-side inspection.
iris1 = pd.DataFrame(x, columns=['sepal_Length', 'sepal_width', 'petal_Length', 'petal_width'])
# NOTE: these two columns are the PCA components computed from all four
# features; 'sepal' and 'petal' are only the labels chosen for them here.
iris2 = pd.DataFrame(x_row2, columns=['sepal', 'petal'])

print(iris2.head(3))  # reduced data
'''
sepal petal
0 -2.684126 0.319397
1 -2.714142 -0.177001
2 -2.888991 -0.144949
'''
print(iris1.head(3))  # original data
'''
sepal_Length sepal_width petal_Length petal_width
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
'''