[딥러닝] PCA

BACK END/Deep Learning

[딥러닝] PCA

circle kim 2021. 3. 16. 16:28

특성공학중 PCA(Principal Component Analysis)
: 특성을 단순히 선택하는 것이 아니라 특성들의 조합으로 새로운 특성을 생성

: PCA(주성분 분석)는 특성 추출(Feature Extraction) 기법에 속함

iris dataset으로 차원 축소 (4개의 열을 2(sepal, petal))

* pca_test.py

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.datasets import load_iris
plt.rc('font', family='malgun gothic')

iris = load_iris()
n = 10
x = iris.data[:n, :2] # sepal 자료로 패턴확인
print('차원 축소 전  x:\n', x, x.shape, type(x)) # (10, 2) <class 'numpy.ndarray'>
'''
 [[5.1 3.5]
 [4.9 3. ]
 [4.7 3.2]
 [4.6 3.1]
 [5.  3.6]
 [5.4 3.9]
 [4.6 3.4]
 [5.  3.4]
 [4.4 2.9]
 [4.9 3.1]]
'''
print(x.T)
# [[5.1 4.9 4.7 4.6 5.  5.4 4.6 5.  4.4 4.9]
#  [3.5 3.  3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1]]

from sklearn.datasets import load_iris

load_iris() : ndarray type의 iris dataset load.

# 시각화
plt.plot(x.T, 'o:')
plt.xticks(range(2), labels=['꽃받침 길이', '꽃받침 폭'])
plt.xlim(-0.5, 2)
plt.ylim(2.5, 6)
plt.title('iris 특성')
plt.legend(['표본{}'.format(i + 1) for i in range(n)])
plt.show()

# 시각화2 : 산포도
plt.figure(figsize=(8, 8))
df = pd.DataFrame(x)
ax = sns.scatterplot(df[0], df[1], data=df , marker='s', s = 100, color=".2")
for i in range(n):
    ax.text(x[i, 0] - 0.05, x[i, 1] + 0.03, '표본{}'.format(i + 1))
    
plt.xlabel('꽃받침 길이')
plt.ylabel('꽃받침 폭')
plt.title('iris 특성')
plt.show()

# PCA
pca1 = PCA(n_components = 1)
x_row = pca1.fit_transform(x) # 1차원 근사데이터를 반환. 비 지도 학습
print('x_row :\n', x_row, x_row.shape) # (10, 1)
'''
[[ 0.30270263]
 [-0.1990931 ]
 [-0.18962889]
 [-0.33097106]
 [ 0.30743473]
 [ 0.79976625]
 [-0.11185966]
 [ 0.16136046]
 [-0.61365539]
 [-0.12605597]]
'''

x2 = pca1.inverse_transform(x_row)
print('복귀 후 값:\n', x2, x2.shape) # (10, 2)
'''
 [[5.06676112 3.53108532]
 [4.7240094  3.1645881 ]
 [4.73047393 3.17150049]
 [4.63393012 3.06826822]
 [5.06999338 3.53454152]
 [5.40628057 3.89412635]
 [4.78359423 3.22830091]
 [4.97021731 3.42785306]
 [4.44084251 2.86180369]
 [4.77389743 3.21793233]]
'''
print(x_row[0]) # [0.30270263]
print(x2[0, :]) # [5.06676112 3.53108532]

# 시각화2 : 산포도 - 사용
df = pd.DataFrame(x)
ax = sns.scatterplot(df[0], df[1], data=df , marker='s', s = 100, color=".2")
for i in range(n):
    d = 0.03 if x[i, 1] > x2[i, 1] else -0.04
    ax.text(x[i, 0] - 0.05, x[i, 1] + 0.03, '표본{}'.format(i + 1))
    plt.plot([x[i, 0], x2[i, 0]], [x[i, 1], x2[i, 1]], "k--")
plt.plot(x2[:, 0], x2[:, 1], "o-", markersize=10, color="b")
plt.plot(x[:, 0].mean(), x[:, 1].mean(), markersize=10, marker="D")
plt.axvline(x[:, 0].mean(), c='r') # 세로선
plt.axhline(x[:, 1].mean(), c='r') # 가로선
plt.xlabel('꽃받침 길이')
plt.ylabel('꽃받침 폭')
plt.title('iris 특성')
plt.show()

x = iris.data
pca2 = PCA(n_components = 2)
x_row2 = pca2.fit_transform(x)
print('x_row2 :\n', x_row2, x_row2.shape)

x4 = pca2.inverse_transform(x_row2)
print('최초자료 :', x[0])         # 최초자료 : [5.1 3.5 1.4 0.2]
print('차원축소 :', x_row2[0])    # 차원축소 : [-2.68412563  0.31939725]
print('최초복귀 :', x4[0, :])     # 최초복귀 : [5.08303897 3.51741393 1.40321372 0.21353169]

print()
iris2 = pd.DataFrame(x_row2, columns=['sepal', 'petal'])
iris1 = pd.DataFrame(x, columns=['sepal_Length', 'sepal_width', 'petal_Length', 'petal_width'])
print(iris2.head(3)) # 차원 축소
'''
      sepal     petal
0 -2.684126  0.319397
1 -2.714142 -0.177001
2 -2.888991 -0.144949
'''
print(iris1.head(3)) # 본래 데이터
'''
   sepal_Length  sepal_width  petal_Length  petal_width
0           5.1          3.5           1.4          0.2
1           4.9          3.0           1.4          0.2
2           4.7          3.2           1.3          0.2
'''