SVM (Support Vector Machine)
: used to separate two classes of data
: finds the optimal hyperplane, the boundary that separates the classes with the largest margin
: the data points closest to the hyperplane are called support vectors
: can handle non-linearly separable problems such as XOR
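For intuition, a minimal sketch (toy data assumed, not from this post) that fits a linear SVM and reads the hyperplane and support vectors back out; the margin width is 2/||w||:
import numpy as np
from sklearn.svm import SVC
X = np.array([[0, 0], [1, 1], [2, 2], [3, 3.5], [4, 5], [5, 6]])  # toy 2-D points
y = np.array([0, 0, 0, 1, 1, 1])
clf = SVC(kernel='linear').fit(X, y)
w, b = clf.coef_[0], clf.intercept_[0]  # hyperplane: w[0]*x1 + w[1]*x2 + b = 0
print('support vectors :\n', clf.support_vectors_)  # points nearest the hyperplane
print('margin width :', 2 / np.linalg.norm(w))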
Classifying the XOR operation
* svm1.py
xor_data = [
[0,0,0],
[0,1,1],
[1,0,1],
[1,1,0],
]
#print(xor_data)
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import svm
xor_df = pd.DataFrame(xor_data)
feature = np.array(xor_df.iloc[:, 0:2])   # first two columns: inputs
label = np.array(xor_df.iloc[:, 2])       # last column: XOR result
print(feature)
'''
[[0 0]
[0 1]
[1 0]
[1 1]]
'''
print(label) # [0 1 1 0]
model = LogisticRegression() # linear classification model
model.fit(feature, label)
pred = model.predict(feature)
print('pred :', pred)
# pred : [0 0 0 0]
model = svm.SVC() # handles linear and non-linear classification (non-linear via the kernel trick)
model.fit(feature, label)
pred = model.predict(feature)
print('pred :', pred)
# pred : [0 1 1 0]
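The kernel trick lets the SVM separate XOR by implicitly mapping the inputs into a higher-dimensional space. A sketch of the same idea done by hand (an assumed illustration, not part of the original script): adding the product x1*x2 as a third feature makes XOR separable even for a linear model.
feature2 = np.hstack([feature, (feature[:, 0] * feature[:, 1]).reshape(-1, 1)])
model2 = LogisticRegression(C=100).fit(feature2, label)  # weak regularization for the tiny 4-point set
print('pred :', model2.predict(feature2))  # expected: [0 1 1 0]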
# Inspecting the support vectors
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import numpy as np
X, y = make_blobs(n_samples=50, centers=2, cluster_std=0.5, random_state=4)
y = 2 * y - 1   # relabel classes from {0, 1} to {-1, +1}
plt.scatter(X[y == -1, 0], X[y == -1, 1], marker='o', label="class -1")
plt.scatter(X[y == +1, 0], X[y == +1, 1], marker='x', label="class +1")
plt.xlabel("x1")
plt.ylabel("x2")
plt.legend()
plt.title("학습용 데이터")
plt.show()
from sklearn.svm import SVC
model = SVC(kernel='linear', C=1.0).fit(X, y) # try varying the tuning parameter C
xmin = X[:, 0].min()
xmax = X[:, 0].max()
ymin = X[:, 1].min()
ymax = X[:, 1].max()
xx = np.linspace(xmin, xmax, 10)
yy = np.linspace(ymin, ymax, 10)
X1, X2 = np.meshgrid(xx, yy)
z = np.empty(X1.shape)
for (i, j), val in np.ndenumerate(X1):  # yields ((row, col), value) pairs over the array
    x1 = val
    x2 = X2[i, j]
    p = model.decision_function([[x1, x2]])  # signed distance to the hyperplane
    z[i, j] = p[0]
levels = [-1, 0, 1]
linestyles = ['dashed', 'solid', 'dashed']
plt.scatter(X[y == -1, 0], X[y == -1, 1], marker='o', label="class -1")
plt.scatter(X[y == +1, 0], X[y == +1, 1], marker='x', label="class +1")
plt.contour(X1, X2, z, levels, colors='k', linestyles=linestyles)
plt.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1], s=300, alpha=0.3)
x_new = [10, 2]   # a new observation to classify
plt.scatter(x_new[0], x_new[1], marker='^', s=100)
plt.text(x_new[0] + 0.03, x_new[1] + 0.08, "test data")
plt.xlabel("x1")
plt.ylabel("x2")
plt.legend()
plt.title("SVM 예측 결과")
plt.show()
# Print the support vector coordinates
print(model.support_vectors_)
'''
[[9.03715314 1.71813465]
[9.17124955 3.52485535]]
'''
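Incidentally, the ndenumerate loop above can be collapsed into one vectorized call; a sketch of the equivalent computation:
grid = np.c_[X1.ravel(), X2.ravel()]  # all (x1, x2) grid points as a (100, 2) array
z = model.decision_function(grid).reshape(X1.shape)  # same z as the loop produces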
* svm2_iris.py

* svm3_bmi.py
Generate a large amount of data using the BMI formula, then classify it with an SVM model.
Formula: BMI = weight (kg) / [height (m)]²
Reference criteria:
  underweight : below 20
  normal      : 20 - 24
  overweight  : 25 - 29
  obese       : 30 and above
(The script below uses simpler cutoffs: thin < 18.5, normal < 23, fat otherwise.)
print(67/((170 / 100) * (170 / 100)))  # sanity check: 67 kg at 170 cm -> BMI ≈ 23.2
import random
def calc_bmi(h, w):
    bmi = w / (h / 100) ** 2   # height arrives in cm
    if bmi < 18.5: return 'thin'
    if bmi < 23: return 'normal'
    return 'fat'
print(calc_bmi(170, 65))
fp = open('bmi.csv', 'w')
fp.write('height,weight,label\n')   # no spaces after commas, so the column names parse cleanly
cnt = {'thin': 0, 'normal': 0, 'fat': 0}
for i in range(50000):
    h = random.randint(150, 200)
    w = random.randint(35, 100)
    label = calc_bmi(h, w)
    cnt[label] += 1
    fp.write('{0},{1},{2}\n'.format(h, w, label))
fp.close()
print('good')
# Classify the BMI dataset
from sklearn import svm, metrics
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
tbl = pd.read_csv('bmi.csv')
# normalize the feature columns to roughly the 0-1 range
label = tbl['label']
print(label)
w = tbl['weight'] / 100
h = tbl['height'] / 200
wh = pd.concat([w, h], axis=1)
print(wh.head(5), wh.shape)
'''
weight height
0 0.69 0.850
1 0.51 0.835
2 0.70 0.830
3 0.71 0.945
4 0.50 0.980 (50000, 2)
'''
label = label.map({'thin':0, 'normal':1, 'fat':2})
'''
0 2
1 0
2 2
3 1
4 0
'''
print(label[:5], label.shape) # (50000,)
# train/test
data_train, data_test, label_train, label_test = train_test_split(wh, label)
print(data_train.shape, data_test.shape) # (37500, 2) (12500, 2)
# model
model = svm.SVC(C=0.01).fit(data_train, label_train)
#model = svm.LinearSVC().fit(data_train, label_train)
print(model)
# cross-validation, to check whether the fitted model's results are reliable
from sklearn import model_selection
cross_vali = model_selection.cross_val_score(model, wh, label, cv=3)
# k-fold cross-validation: the data is split into 3 folds and each fold
# takes one turn as the validation set while the other folds train the model
#   validate train    train
#   train    validate train
#   train    train    validate
print('score per fold:', cross_vali) # [0.96754065 0.96400072 0.96783871]
print('mean score:', cross_vali.mean()) # 0.9664600275737195
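To make the 3-fold split concrete, a sketch that prints the fold sizes (plain KFold is shown only for illustration; for classifiers, cross_val_score actually uses a stratified variant internally):
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)
for fold, (train_idx, vali_idx) in enumerate(kf.split(wh)):
    print('fold', fold, ': train', len(train_idx), ', validate', len(vali_idx))
# each fold trains on ~2/3 of the 50000 rows and validates on the remaining ~1/3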
pred = model.predict(data_test)
ac_score = metrics.accuracy_score(label_test, pred)
print('classification accuracy :', ac_score) # 0.96816
print(metrics.classification_report(label_test, pred))
'''
precision recall f1-score support
0 0.98 0.97 0.98 4263
1 0.91 0.94 0.93 2644
2 0.98 0.98 0.98 5593
accuracy 0.97 12500
macro avg 0.96 0.96 0.96 12500
weighted avg 0.97 0.97 0.97 12500
'''
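As a reminder of what the report's columns mean: per class, precision = TP / (TP + FP) and recall = TP / (TP + FN). A sketch recovering them directly from the confusion matrix:
import numpy as np
cm = metrics.confusion_matrix(label_test, pred)  # rows: true class, cols: predicted class
tp = np.diag(cm)
print('precision :', tp / cm.sum(axis=0))  # column sums = everything predicted as each class
print('recall :', tp / cm.sum(axis=1))     # row sums = everything truly in each class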
# Visualization
tbl2 = pd.read_csv('bmi.csv', index_col = 2)
print(tbl2[:3])
'''
height weight
label
fat 170 69
thin 167 51
fat 166 70
'''
def scatter_func(lbl, color):
    b = tbl2.loc[lbl]   # rows whose index (label) matches lbl
    plt.scatter(b['weight'], b['height'], c=color, label=lbl)
fig = plt.figure()
scatter_func('fat', 'red')
scatter_func('normal', 'yellow')
scatter_func('thin', 'blue')
plt.legend()
plt.savefig('bmi_test.png')
plt.show()
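To see the decision regions the SVM learned, a hedged follow-up sketch (not in the original script) that colors the normalized (weight, height) plane by predicted class, reusing the fitted model:
import numpy as np
ww, hh = np.meshgrid(np.linspace(0.35, 1.0, 100),   # weight / 100
                     np.linspace(0.75, 1.0, 100))   # height / 200
grid = pd.DataFrame({'weight': ww.ravel(), 'height': hh.ravel()})  # same columns as wh
zz = model.predict(grid).reshape(ww.shape)
plt.contourf(ww, hh, zz, alpha=0.3)   # one colored region per class (thin/normal/fat)
plt.xlabel('weight / 100')
plt.ylabel('height / 200')
plt.show()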
Image classification with an SVM model
* svm4.py
from sklearn.datasets import fetch_lfw_people
fetch_lfw_people(min_faces_per_person=60) : loads the Labeled Faces in the Wild photo data; min_faces_per_person keeps only people who have at least that many pictures
scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_lfw_people.html
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
faces = fetch_lfw_people(min_faces_per_person = 60)
print(faces)
print(faces.DESCR)
print(faces.data)
print(faces.data.shape) # (729, 2914)
print(faces.target)
print(faces.target_names)
print(faces.images.shape) # (729, 62, 47)
print(faces.images[0])
print(faces.target_names[faces.target[0]])
plt.imshow(faces.images[0], cmap='bone') # cmap : color map
plt.show()
fig, ax = plt.subplots(3, 5)
print(fig) # Figure(640x480)
print(ax.flat) # <numpy.flatiter object at 0x00000235198C5D30>
print(len(ax.flat)) # 15
for i, axi in enumerate(ax.flat):
    axi.imshow(faces.images[i], cmap='bone')
    axi.set(xticks=[], yticks=[], xlabel=faces.target_names[faces.target[i]])
plt.show()
- Reduce the image dimensionality with principal component analysis (PCA), then classify
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
m_pca = PCA(n_components=150, whiten=True, random_state = 0)
m_svc = SVC(C=1)
model = make_pipeline(m_pca, m_svc)
print(model)
# Pipeline(steps=[('pca', PCA(n_components=150, random_state=0, whiten=True)),
# ('svc', SVC(C=1))])
- train/test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(faces.data, faces.target, random_state=1)
print(x_train[0], x_train.shape) # (546, 2914)
print(y_train[0], y_train.shape) # (546,)
model.fit(x_train, y_train) # train data로 모델 fitting
pred = model.predict(x_test)
print('pred :', pred) # pred : [1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 ..
print('real :', y_test) # real : [0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 ..
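The C=1 above was an unexamined default; a sketch of tuning the pipeline with a grid search (the candidate values below are assumptions; svc__C / svc__gamma follow scikit-learn's step__parameter naming for pipelines):
from sklearn.model_selection import GridSearchCV
param_grid = {'svc__C': [1, 5, 10, 50],                     # assumed candidates
              'svc__gamma': ['scale', 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid, cv=5)
grid.fit(x_train, y_train)
print('best params :', grid.best_params_)
best_pred = grid.best_estimator_.predict(x_test)  # kept separate from pred below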
- Classification accuracy
from sklearn.metrics import classification_report
print(classification_report(y_test, pred, target_names = faces.target_names)) # per-class report
'''
precision recall f1-score support
Donald Rumsfeld 1.00 0.25 0.40 20
George W Bush 0.80 1.00 0.89 132
Gerhard Schroeder 1.00 0.45 0.62 31
accuracy 0.83 183
macro avg 0.93 0.57 0.64 183
weighted avg 0.86 0.83 0.79 183
=> f1-score/accuracy -> 0.83
'''
from sklearn.metrics import confusion_matrix, accuracy_score
mat = confusion_matrix(y_test, pred)
print('confusion_matrix :\n', mat)
'''
[[ 5 15 0]
[ 0 132 0]
[ 0 17 14]]
'''
print('acc :', accuracy_score(y_test, pred)) # 0.82513
- Visualize the classification results
# preview one sample, x_test[0]
plt.subplots(1, 1)
print(x_test[0], ' ', x_test[0].shape)
# [ 24.333334 33. 72.666664 ... 201.66667 201.33333 155.33333 ] (2914,)
print(x_test[0].reshape(62, 47)) # reshape 1-D to 2-D so it can be drawn as an image
plt.imshow(x_test[0].reshape(62, 47), cmap='bone')
plt.show()
fig, ax = plt.subplots(4, 6)
for i, axi in enumerate(ax.flat):
    axi.imshow(x_test[i].reshape(62, 47), cmap='bone')
    axi.set(xticks=[], yticks=[])
    axi.set_ylabel(faces.target_names[pred[i]].split()[-1],
                   color='black' if pred[i] == y_test[i] else 'red')   # red = misclassified
fig.suptitle('pred result', size = 14)
plt.show()
- Visualize the confusion matrix
import seaborn as sns
sns.heatmap(mat.T, square = True, annot=True, fmt='d', cbar=False, \
xticklabels=faces.target_names, yticklabels=faces.target_names)
plt.xlabel('true (actual) label')
plt.ylabel('predicted label')
plt.show()