24. Association analysis: an unsupervised method for discovering association rules between items

install.packages("arules")
library(arules)

 

 - Practice building the transaction format that arules expects

d_list <- list(c('a', 'b'), c('a', 'c'), c('a', 'e'), c('b', 'c', 'e'))
d_list
names(d_list) <- paste0('tr', c(1:4))
d_list
# $tr1
# [1] "a" "b"
# 
# $tr2
# [1] "a" "c"
# 
# $tr3
# [1] "a" "e"
# 
# $tr4
# [1] "b" "c" "e"
tran <- as(d_list, 'transactions')
tran
inspect(tran)
# items   transactionID
# [1] {a,b}   tr1          
# [2] {a,c}   tr2          
# [3] {a,e}   tr3          
# [4] {b,c,e} tr4

cust <- read.csv("testdata/priori_data2.csv", stringsAsFactors = F)
cust
str(cust)

cust$sangpum[cust$irum=='홍길동']

cust_list <- split(cust$sangpum, cust$irum)
cust_list
# $강감찬
# [1] "감자깡" "새우깡"
# 
# $공기밥
# [1] "자갈치" "맛동산"
# 
# $신기해
# [1] "코코넛" "맛동산"
# 
# $한국인
# [1] "짱구"   "감자깡"
# 
# $홍길동
# [1] "새우깡" "맥주"

cust_tran <- as(cust_list, 'transactions')
cust_tran
inspect(cust_tran)
# items           transactionID
# [1] {감자깡,새우깡} 강감찬       
# [2] {맛동산,자갈치} 공기밥       
# [3] {맛동산,코코넛} 신기해       
# [4] {감자깡,짱구}   한국인       
# [5] {맥주,새우깡}   홍길동  

# Generate association rules
cust_rules <- apriori(cust_tran)
# confidence minval smax arem  aval originalSupport maxtime support minlen maxlen target  ext
# 0.8    0.1    1 none FALSE            TRUE       5     0.1      1     10  rules TRUE

summary(cust_tran)
inspect(cust_rules)
#       lhs         rhs      support confidence coverage lift count
# [1] {자갈치} => {맛동산} 0.2     1          0.2      2.5  1    
# [2] {코코넛} => {맛동산} 0.2     1          0.2      2.5  1    
# [3] {짱구}   => {감자깡} 0.2     1          0.2      2.5  1    
# [4] {맥주}   => {새우깡} 0.2     1          0.2      2.5  1  
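The metrics for {자갈치} => {맛동산} can be reproduced by hand from the five transactions above (a quick check of the definitions):

support    <- 1/5           # {자갈치,맛동산} appears in 1 of 5 transactions
confidence <- (1/5) / (1/5) # support(lhs and rhs) / support(lhs) = 1
lift       <- 1 / (2/5)     # confidence / support(rhs) = 1 / 0.4 = 2.5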

customer <- read.csv("testdata/priori_data.csv", stringsAsFactors = F)
customer
#    bunho   irum    sangpum
# 1      1 홍길동     새우깡
# 2      2 홍길동     맛동산
# 3      3 홍길동       맥주
# 4      4 사오정       짱구

customer$sangpum[customer$irum == '홍길동']


customer_list <- split(customer$sangpum, customer$irum)
customer_list

customer_list <- sapply(customer_list, unique) # remove duplicate items per customer
customer_tran <- as(customer_list, 'transactions')
customer_tran
summary(customer_tran)

 

 - Extracting information

itemFrequency(customer_tran) # item purchase frequencies (support per item)
#    감자깡     맛동산       맥주     새우깡     자갈치       짱구 초코칩쿠키 크라운산도     포카칩 
# 0.5000000  0.6666667  0.3333333  0.3333333  0.3333333  0.5000000  0.1666667  0.1666667  0.1666667 
itemFrequency(customer_tran[, 1:3])
itemFrequencyPlot(customer_tran)
itemFrequencyPlot(customer_tran, support = 0.3)
itemFrequencyPlot(customer_tran, topN = 3)

 

 - Association rules

customer_rules <- apriori(customer_tran)
summary(customer_rules)

inspect(customer_rules)
inspect(sort(customer_rules, by = "lift")[1:5])
#     lhs                    rhs          support   confidence coverage  lift count
# [1] {크라운산도}        => {포카칩}     0.1666667 1          0.1666667 6    1    
# [2] {포카칩}            => {크라운산도} 0.1666667 1          0.1666667 6    1    
# [3] {맥주,크라운산도}   => {포카칩}     0.1666667 1          0.1666667 6    1    
# [4] {맥주,포카칩}       => {크라운산도} 0.1666667 1          0.1666667 6    1    
# [5] {새우깡,크라운산도} => {포카칩}     0.1666667 1          0.1666667 6    1    
inspect(sort(customer_rules, by = "lift", decreasing = FALSE)[1:5])
inspect(sort(customer_rules, by = "support", decreasing = FALSE)[1:5]) # 지지도

rule_subset <- subset(customer_rules, items %in% c('맛동산', '짱구'))
inspect(rule_subset)
inspect(rule_subset[1:5])
#     lhs                    rhs      support   confidence coverage  lift count
# [1] {초코칩쿠키}        => {맛동산} 0.1666667 1          0.1666667 1.5  1    
# [2] {자갈치}            => {맛동산} 0.3333333 1          0.3333333 1.5  2    
# [3] {자갈치,초코칩쿠키} => {맛동산} 0.1666667 1          0.1666667 1.5  1    
# [4] {맛동산,초코칩쿠키} => {자갈치} 0.1666667 1          0.1666667 3.0  1    
# [5] {자갈치,짱구}       => {맛동산} 0.1666667 1          0.1666667 1.5  1    

 

 - Visualization

install.packages("arulesViz")
library(arulesViz)
plot(customer_rules)
plot(customer_rules, method = "grouped")
plot(customer_rules, method = "graph") # 원 : 연관관계, 원의 크기 : support, 색상진하기 : lift
plot(customer_rules, method = "graph", control = list(type="items"))
plot(customer_rules, method = "graph", engine = 'interactive')

 


# Groceries: one month of real point-of-sale data from a grocery outlet

data("Groceries")
str(Groceries)
Groceries

gdf <- as(Groceries, 'data.frame')
head(gdf)

library(arules)
rules <- apriori(Groceries, parameter = list(supp=0.001, conf=0.8))
inspect(rules)
plot(rules, method="grouped")

rules <- apriori(Groceries, parameter = list(supp=0.001, conf=0.8, maxlen = 3))
inspect(rules)

rules <- sort(rules, decreasing = T, by='confidence')
inspect(rules)

library(arulesViz)
plot(rules, method = 'graph', control = list(type ='items'))

wmilk <- subset(rules, rhs %in% 'whole milk')
inspect(wmilk)
plot(wmilk, method = 'graph')

oveg <- subset(rules, rhs %in% 'other vegetables')
inspect(oveg)
plot(oveg, method = 'graph')

b_y <- subset(rules, lhs %in% c('butter', 'yogurt'))
inspect(b_y)
plot(b_y, method = 'graph')
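With supp = 0.001 and conf = 0.8 the rule set is large and contains many redundant rules; arules can filter out rules whose more general versions do at least as well (a minimal sketch using arules::is.redundant):

rules_pruned <- rules[!is.redundant(rules)] # keep only non-redundant rules
inspect(head(sort(rules_pruned, by = 'lift'), 5))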

23. Cluster analysis: k-means and hierarchical methods

 

 - data load

data <- read.csv("testdata/exam.csv", sep = " ")
data
#     bun kor mat eng sci
# 1    1  98  95  95  90
# 2    2  65  90  60  88
# 3    3  85  53  48  50
# 4    4  65  92  62  90
# 5    5  68  72  88  73
# 6    6  90  92  90  96
# 7    7  65  70  76  80
# 8    8  60  91  62  90
# 9    9  65  70  86  76
# 10  10 100  98  97 100

 

 - Distance calculation

d_data <- dist(data, method = "euclidean")
d_data
#            1         2         3         4         5         6         7         8         9
# 2  48.414874                                                                                
# 3  75.802375 57.948253                                                                      
# 4  46.861498  4.000000 60.975405                                                            
# 5  42.225585 36.755952 52.754147 37.080992                                                  
# 6  12.609520 40.112342 73.722452 38.065733 37.656341                                        
# 7  47.021272 27.294688 48.877398 28.089144 14.491377 39.522146                              
# 8  50.970580  8.366600 62.369865  6.480741 37.403208 41.533119 27.622455                    
# 9  45.332108 35.623026 53.338541 35.791060  6.480741 39.166312 10.954451 35.199432          
# 10 14.071247 53.535035 84.852814 51.205468 50.348784 14.730920 53.469618 54.571055 52.028838
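The entry 4.000000 for the pair (2, 4) can be verified by hand. Note that dist(data) also includes the bun (ID) column, so the IDs contribute to every distance; dist(data[, -1]) would compare the scores only (a quick check):

sqrt(sum((data[2, ] - data[4, ])^2)) # 4 : (2-4)^2 + 0 + 2^2 + 2^2 + 2^2 = 16
dist(data[c(2, 4), -1])              # sqrt(12), about 3.46, without the ID column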

 

 - Multidimensional scaling (MDS)

gra_data <- cmdscale(d_data)
gra_data
plot(gra_data, type = "n") # empty canvas; about four clusters become visible once labeled
text(gra_data, as.character(1:10)) # draw the row numbers on the plot

data$avg <- apply(data[, 2:5], 1, mean) # reference avg column: row-wise mean of the four scores
data
#    bun kor mat eng sci   avg
# 1    1  98  95  95  90 94.50
# 2    2  65  90  60  88 75.75
# 3    3  85  53  48  50 59.00
# 4    4  65  92  62  90 77.25
# 5    5  68  72  88  73 75.25
# 6    6  90  92  90  96 92.00
# 7    7  65  70  76  80 72.75
# 8    8  60  91  62  90 75.75
# 9    9  65  70  86  76 74.25
# 10  10 100  98  97 100 98.75

 

 - k-means

library(NbClust)
data_s <- scale(data[2:5])
head(data_s)
#             kor        mat        eng        sci
# [1,]  1.4199308  0.8523469  1.0794026  0.4640535
# [2,] -0.7196910  0.5167772 -0.9517313  0.3255301
# [3,]  0.5770495 -1.9664380 -1.6481201 -2.3064152
# [4,] -0.7196910  0.6510051 -0.8356665  0.4640535
# [5,] -0.5251799 -0.6912734  0.6731758 -0.7133957
# [6,]  0.9012346  0.6510051  0.7892406  0.8796238
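scale() standardizes each column as (x - mean(x)) / sd(x); the first kor value can be verified by hand (a quick check):

(data$kor[1] - mean(data$kor)) / sd(data$kor) # (98 - 76.1) / 15.4233 = 1.4199308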

 

 

 - Find the best number of clusters

nc <- NbClust::NbClust(data_s, min.nc = 2, max.nc = 5, method = "kmeans")
nc # 4 2 3 2 1 4 1 2 1 4

 

plot(table(nc$Best.nc[1,]))

 

 - model

model_kmeans <- kmeans(data[, c("bun", "avg")], 4)
model_kmeans

table(model_kmeans$cluster)
# 1 2 3 4 
# 3 1 3 3 

cluster <- model_kmeans$cluster
cluster # e.g. 2 1 4 1 1 2 3 3 3 2 - cluster numbering is arbitrary and varies between runs
kmeans_df <- cbind(cluster, data[, c("bun", "avg")])
kmeans_df
#    cluster bun   avg
# 1        4   1 94.50
# 2        1   2 75.75
# 3        3   3 59.00
# 4        1   4 77.25
# 5        1   5 75.25
# 6        4   6 92.00
# 7        2   7 72.75
# 8        2   8 75.75
# 9        2   9 74.25
# 10       4  10 98.75

str(kmeans_df)
kmeans_df <- transform(kmeans_df, cluster = as.factor(cluster)) # convert to a factor (categorical)
str(kmeans_df) 

 

 - clustering

library(ggplot2)
df_plot <- ggplot(data = kmeans_df, aes(x=bun, y=avg, col = cluster)) +
            geom_point(aes(shape = cluster), size = 4) +
            ggtitle("군집분석연습")
df_plot


 - Hierarchical and non-hierarchical clustering: diamonds data

data("diamonds")
diamonds
# carat cut       color clarity depth table price     x     y     z
# <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
#   1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
# 2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
# 3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
# 4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
# 5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
# 6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
# 7 0.24  Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
# 8 0.26  Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
# 9 0.22  Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
# 10 0.23  Very Good H     VS1      59.4    61   338  4     4.05  2.39
typeof(diamonds) # list

length(diamonds) # 10
NROW(diamonds) # 53940
t <- sample(1:nrow(diamonds), 1000)
test <- diamonds[t, ]
dim(test) # 1000   10
str(test)

mydia <- test[c("price", "carat","depth","table")]
head(mydia)

 

1. Hierarchical clustering (exploratory)

result1 <- hclust(dist(mydia), method = "ave")
result1
plot(result1, hang = -1)

2. Non-hierarchical clustering (confirmatory)

result2 <- kmeans(mydia, 4) # 4 clusters
result2
names(result2)

result2$cluster

mydia$cluster <- result2$cluster
head(mydia)
#     price carat depth table cluster
# <int> <dbl> <dbl> <dbl>   <int>
#   1 11322  1.7   60.1    58       1
# 2  1009  0.5   63.8    57       3
# 3  1022  0.43  62.4    60       3
# 4  4778  1.2   58.5    61       4
# 5  4111  0.81  61.6    58       4
# 6   675  0.3   61.3    60       3

cor(mydia[, -5], method = 'pearson') # correlation coefficients
#             price      carat       depth      table
# price  1.00000000 0.90263800 -0.01329869  0.1134894
# carat  0.90263800 1.00000000  0.02856267  0.1670910
# depth -0.01329869 0.02856267  1.00000000 -0.3562396
# table  0.11348941 0.16709103 -0.35623960  1.0000000

plot(mydia[, -5])

 

plot(mydia[c("carat", "price")], col=mydia$cluster) # visualize the clusters on carat vs. price

# mark the cluster centers
points(result2$centers[, c("carat", "price")], col = c(1,2,3,4), pch=8, cex=5)

22. Non-hierarchical clustering: the number of clusters is fixed in advance
 : k-means is the most widely used method

iris_s <- scale(iris[-5]) # scale: standardization; the Species column is excluded
head(iris_s, 3)

 - Decide the number of clusters, then fit the k-means model

install.packages("NbClust")
library(NbClust)
nc <- NbClust(iris_s, min.nc = 2, max.nc = 10, method="kmeans") # evaluates candidate cluster counts

table(nc$Best.nc[1, ])
# 0  2  3  8 10 
# 2 11 11  1  1 
plot(table(nc$Best.nc[1, ]))

iris_k <- kmeans(iris_s, centers = 3, iter.max = 100)
iris_k

names(iris_k)

table(iris_k$cluster)
# 1  2  3 
# 62 50 38
plot(iris_k$cluster, col=iris_k$cluster)

iris$Species2 <- ifelse(iris$Species == 'setosa', 1, iris$Species)
head(iris, 3)
plot(iris$Species2, col=iris$Species2)
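Instead of recoding Species by hand, cross-tabulating the cluster labels against the true species shows how well the clusters recover the classes (a minimal sketch; cluster numbering is arbitrary per run):

table(iris$Species, iris_k$cluster) # rows: species, columns: cluster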

 

 - PAM

library(cluster)
iris_pam <- pam(iris_s, 3)
iris_pam
names(iris_pam)
(iris_pam$clustering)
table(iris_pam$clustering) # cluster sizes
# 1  2  3 
# 50 52 48 
clusplot(iris_pam)

 

21. Clustering: unsupervised learning - uses Euclidean distance

 

x <- matrix(1:16, nrow = 4)
x
# [,1] [,2] [,3] [,4]
# [1,]    1    5    9   13
# [2,]    2    6   10   14
# [3,]    3    7   11   15
# [4,]    4    8   12   16
help(dist)
d <- dist(x, method = "euclidean")
d
#   1 2 3
# 2 2    
# 3 4 2  
# 4 6 4 2
plot(d) # plots the 6 pairwise distances against their index
text(d, labels = LETTERS[1:6], pos = 3) # label the distances A-F

txt1 <- read.csv("testdata/cluster_ex.csv")
txt1
# irum kor eng
# 1 홍길동  80  90
# 2 이기자  70  40
# 3 유별나  65  75
# 4 강나루  85  65
# 5 전속력  95  87
plot(txt1[, c(2:3)],
     xlab ='국어',
     ylab ='영어',
     xlim = c(30, 100),
     ylim = c(30, 100),
     main = '학생점수')

text(txt1[, 2], txt1[, 3], labels = abbreviate(rownames(txt1)), cex = 0.8, pos = 1, col = "blue")
text(txt1[, 2], txt1[, 3], labels = txt1[,1], cex = 0.8, pos = 2, col = "red")

dist_m <- dist(txt1[c(1:2), c(2:3)], method = "manhattan")
dist_m
# 1
# 2 60

dist_e <- dist(txt1[c(1:2), c(2:3)], method = "euclidean")
dist_e
# 1
# 2 50.9902

 - Hierarchical clustering

x <- c(1,2,2,4,5)
y <- c(1,1,4,3,5)
xy <- data.frame(cbind(x,y))
plot(xy,
     xlab ='x',
     ylab ='y',
     xlim = c(0, 6),
     ylim = c(0, 6),
     main = '응집적 군집분석')

text(xy[, 1], xy[, 2], labels = abbreviate(rownames(xy)),
     cex = 0.8, pos = 1, col = "blue")
abline(v=c(3), col = 'gray', lty=2)
abline(h=c(3), col = 'gray', lty=2)

 

 - Euclidean distance

dist(xy, method = 'euclidean') ^ 2
#    1  2  3  4
# 2  1         
# 3 10  9      
# 4 13  8  5   
# 5 32 25 10  5
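For example, the entry 10 for the pair (1, 3) is (1-2)^2 + (1-4)^2 = 1 + 9 (a quick check):

sum((xy[1, ] - xy[3, ])^2) # 10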

 

 - Plot as a dendrogram

hc_sl <- hclust(dist(xy) ^ 2, method = "single") # single linkage (minimum distance)
hc_sl
plot(hc_sl, hang = -1)

hc_co <- hclust(dist(xy) ^ 2, method = "complete") # complete linkage (maximum distance)
hc_co
plot(hc_co, hang = -1)

hc_av <- hclust(dist(xy) ^ 2, method = "average") # average linkage
hc_av
plot(hc_av, hang = -1)

par(oma = c(3, 0, 1, 0))
par(mfrow = c(1,3))
plot(hc_sl, hang = -1)
plot(hc_co, hang = -1)
plot(hc_av, hang = -1)
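To turn a dendrogram into actual cluster assignments, cutree() cuts the tree at a chosen number of groups (a minimal sketch with k = 2):

cutree(hc_av, k = 2) # cluster membership for each of the 5 points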


 - Middle-school physical examination results

body <- read.csv("testdata/bodycheck.csv")
body
#      번호 악력 신장 체중 안경유무
# 1     1   28  146   34        1
# 2     2   46  169   57        2
# 3     3   39  160   48        2
# 4     4   25  156   38        1
# 5     5   34  161   47        1
# 6     6   29  168   50        1
# 7     7   38  154   54        2
# 8     8   23  153   40        1
# 9     9   42  160   62        2
# 10   10   27  152   39        1
# 11   11   35  155   46        1
# 12   12   39  154   54        2
# 13   13   38  157   57        2
# 14   14   32  162   53        2
# 15   15   25  142   32        1
dim(body)
head(body, 2)

d <- dist(body[, -1]) # distance matrix (ID column excluded)
d

hc <- hclust(d, method = "complete")
hc
# Cluster method   : complete 
# Distance         : euclidean 
# Number of objects: 15 

plot(hc, hang=-1) # hang = -1 aligns the leaf labels at the bottom
rect.hclust(hc, k=3, border = "red")

 - Characteristics of each cluster

g1 <- subset(body, 번호 == 10 |번호 == 4 |번호 == 8 |번호 == 1 |번호 == 15)
g2 <- subset(body, 번호 == 11 |번호 == 3 |번호 == 5 |번호 == 6 |번호 == 14)
g3 <- subset(body, 번호 == 2 |번호 == 9 |번호 == 13 |번호 == 7 |번호 == 12)

g1[2:5]
g2[2:5]
g3[2:5]

summary(g1[2:5])
summary(g2[2:5])
summary(g3[2:5])
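The manual subset() calls above can be replaced by cutting the tree and aggregating per cluster (a sketch assuming k = 3, matching the rectangles drawn earlier):

grp <- cutree(hc, k = 3)
aggregate(body[, -1], by = list(cluster = grp), FUN = mean) # per-cluster means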

20. Neural Network: MLP with backpropagation support (deep learning)

 

- Load the library

install.packages("neuralnet")
library(neuralnet)

 

 - Data

head(iris, 2)
unique(iris$Species) # setosa     versicolor virginica 
iris$Species2[iris$Species == 'setosa'] <- 1
iris$Species2[iris$Species == 'versicolor'] <- 2
iris$Species2[iris$Species == 'virginica'] <- 3
iris$Species <- NULL
head(iris, 2)
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species2
# 1          5.1         3.5          1.4         0.2        1
# 2          4.9         3.0          1.4         0.2        1

 

 - train, test

set.seed(42)
idx <- sample(1:nrow(iris), nrow(iris)*0.7)
train <- iris[idx, ]
test <- iris[-idx, ]

 

 - Normalization

normal_func <- function(x){
  return ((x - min(x)) / (max(x) - min(x))) # min-max scaling to [0, 1]
}
normal_func(c(1,2,3)) # 0.0 0.5 1.0

train_nor <- as.data.frame(lapply(train, normal_func))
head(train_nor, 3) # every column, including Species2, now lies in [0, 1]
test_nor <- as.data.frame(lapply(test, normal_func))
head(test_nor, 3) # Species2 is rescaled to the codes 0 / 0.5 / 1

 

- Build the model

help("neuralnet")
model <- neuralnet(Species2 ~ Sepal.Length+Sepal.Width+Petal.Length+Petal.Width,
                   data=train_nor, hidden = 1) # formula: label ~ features, one hidden node
model
plot(model)

 - Model evaluation: use compute() instead of predict()

model_result <- compute(model, test_nor[c(1:4)])
names(model_result) # neurons    net.result
model_result$neurons
head(model_result$net.result, 3) # predicted values, near the 0 / 0.5 / 1 codes
head(test_nor$Species2, 3) # actual values

 

 - Check the correlation, then report the classification accuracy

cor(model_result$net.result, test_nor$Species2) # correlation between predictions and actuals (near 1 for a good fit)

pred_weights <- model_result$net.result

func <- function(x){
  if(x >= 0.75)
    return ('virginica')
  else if(x >= 0.25)
    return ('versicolor')
  else
    return ('setosa')
}
func(-1)  # setosa
func(2)   # virginica
func(0.4) # versicolor
sp <- apply(pred_weights, 1, func)
sp
t <- table(sp, test_nor$Species2)
t # confusion matrix: predicted species vs. the 0 / 0.5 / 1 codes
sum(diag(t)) / nrow(test_nor) # classification accuracy

 

 - Changing the model parameters

model2 <- neuralnet(Species2 ~ Sepal.Length+Sepal.Width+Petal.Length+Petal.Width,
                   data=train_nor, hidden = 5, algorithm = "backprop", learningrate = 0.01)
# learningrate : learning rate for standard backpropagation
model2
plot(model2)

 


 - Building a model without normalizing the inputs

data(iris)
head(iris, 2)
unique(iris$Species)
set.seed(123)
idx <- sample(1:nrow(iris), nrow(iris)*0.7)
train <- iris[idx, ]
test <- iris[-idx, ]

train <- cbind(train, train$Species == 'setosa')
train <- cbind(train, train$Species == 'versicolor')
train <- cbind(train, train$Species == 'virginica')
train
names(train)[6:8] <- c('setosa', 'versicolor', 'virginica')
head(train, 2)

model <- neuralnet(setosa+versicolor+virginica ~ Sepal.Length+Sepal.Width+Petal.Length+Petal.Width,
                   data=train, hidden = 3)
plot(model)

 

pred <- compute(model, test[-5])
pred_weight <- pred$net.result
idx <- apply(pred_weight, 1, which.max) # column index of the largest value in each row
idx
# 1   2   3   5  11  18  19  28  29  33  36  45  48  49  55  56  57  58  59  61  62  65  66  68  70  77 
# 1   1   1   1   1   1   1   1   1   1   1   1   1   1   2   2   2   2   2   2   2   2   2   2   2   2 
# 83  84  94  95  98 100 101 104 105 111 113 116 125 131 133 135 140 141 145 
# 2   3   2   2   2   2   3   3   3   3   3   3   3   3   3   3   3   3   3 
c('setosa', 'versicolor', 'virginica')[1]

pred <- c('setosa', 'versicolor', 'virginica')[idx]
pred
# [1] "setosa"     "setosa"     "setosa"     "setosa"     "setosa"     "setosa"     "setosa"    
# [8] "setosa"     "setosa"     "setosa"     "setosa"     "setosa"     "setosa"     "setosa"    
# [15] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor" "versicolor" "versicolor"
# [22] "versicolor" "versicolor" "versicolor" "versicolor" "versicolor" "versicolor" "virginica" 
# [29] "versicolor" "versicolor" "versicolor" "versicolor" "virginica"  "virginica"  "virginica" 
# [36] "virginica"  "virginica"  "virginica"  "virginica"  "virginica"  "virginica"  "virginica" 
# [43] "virginica"  "virginica"  "virginica" 
table(pred, test$Species)
# pred         setosa versicolor virginica
# setosa         14          0         0
# versicolor      0         17         0
# virginica       0          1        13

 

 - Predicting on new values

my <- test
my <- my[c(1:3), ]
my <- edit(my) # edit the three copied rows interactively
my
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# 1            7           1            1           1  setosa
# 2            7           5            3           1  setosa
# 3            2           3            4           5  setosa

mycomp <- compute(model, my[-5])
mypred <- mycomp$net.result
idx2 <- apply(mypred, 1, which.max)
idx2
pred2 <- c('setosa', 'versicolor', 'virginica')[idx2]
pred2 # versicolor setosa virginica


19. ANN (Artificial Neural Network)

 

 1) Logic gate processing

 - Load the library

install.packages("nnet")
library(nnet)

 

 - Build the model

input <- matrix(c(0,0,1,1,0,1,0,1), ncol=2)
input
# [,1] [,2]
# [1,]    0    0
# [2,]    0    1
# [3,]    1    0
# [4,]    1    1
output <- matrix(c(0,0,0,1)) # AND
output
# [,1]
# [1,]    0
# [2,]    0
# [3,]    0
# [4,]    1
output <- matrix(c(0,1,1,1)) # OR
output <- matrix(c(0,1,1,0)) # XOR - not linearly separable, so size = 1 cannot learn it

ann <- nnet(input, output, maxit = 10000, size = 3, decay = 0.001)
# maxit: max iterations, size: number of hidden nodes, decay: weight decay (regularization, not the learning rate)
ann

 - Check the results

result <- predict(ann, input) # predictions for the four input patterns
result
#             [,1]
# [1,] 0.002724474
# [2,] 0.070330761
# [3,] 0.070331671
# [4,] 0.897269791
ifelse(result>0.5, 1, 0) # treat outputs above 0.5 as 1, otherwise 0
#      [,1]
# [1,]    0
# [2,]    0
# [3,]    0
# [4,]    1

df <- data.frame(
  x1 = c(1:6),
  x2 = c(6:1),
  y  = factor(c('n','n','n','y','y','y'))
)
df
# x1 x2 y
# 1  1  6 n
# 2  2  5 n
# 3  3  4 n
# 4  4  3 y
# 5  5  2 y
# 6  6  1 y
str(df)
model_net1 <- nnet(y ~ ., df, size = 1)
model_net1
# a 2-1-1 network with 5 weights
# inputs: x1 x2 
# output(s): y 
# options were - entropy fitting
summary(model_net1)

model_net2 <- nnet(y ~ ., df, size = 2)
model_net2
# a 2-2-1 network with 9 weights
# inputs: x1 x2 
# output(s): y 
# options were - entropy fitting
summary(model_net2)
install.packages("devtools")
library(devtools)
source_url('https://gist.githubusercontent.com/fawda123/7471137/raw/466c1474d0a505ff044412703516c34f1a4684a5/nnet_plot_update.r')
par(mar=c(1,1,1,1))
plot.nnet(summary(model_net1))

plot.nnet(summary(model_net2))

# predict
model_net2$fitted.values
predict(model_net2, df)

pred <- predict(model_net2, df, type='class')
pred
table(pred, df$y)
# pred n y
# n 3 0
# y 0 3

# Using the iris dataset
data(iris)
set.seed(123)
idx <- sample(1:nrow(iris), nrow(iris)*0.7)
train <- iris[idx, ]
test <- iris[-idx, ]

# with one hidden node
model_iris1 = nnet::nnet(Species ~ ., train, size = 1)
model_iris1
# a 4-1-3 network with 11 weights
# inputs: Sepal.Length Sepal.Width Petal.Length Petal.Width 
# output(s): Species 
# options were - softmax modelling 
summary(model_iris1)  # 11 weights; more hidden nodes give more weights and a more flexible (but overfit-prone) fit
plot.nnet(summary(model_iris1))

# with three hidden nodes
model_iris3 = nnet::nnet(Species ~ ., train, size = 3)
summary(model_iris3)
plot.nnet(summary(model_iris3))

# Classification evaluation
pred1 <- predict(model_iris1, test, type="class")
t1 <- table(pred1, test$Species)
sum(diag(t1))/nrow(test) # 0.9777778

pred3 <- predict(model_iris3, test, type="class")
t3 <- table(pred3, test$Species)
sum(diag(t3))/nrow(test)


# SVM classification model: uses the observations that determine the decision margin (support vectors)
# The kernel trick also allows classification of non-linearly separable data

library(e1071)

# train/test split omitted
head(iris, 3)
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# 1          5.1         3.5          1.4         0.2  setosa
# 2          4.9         3.0          1.4         0.2  setosa
# 3          4.7         3.2          1.3         0.2  setosa
x <- subset(iris, select = -Species)
y <- iris$Species
x
y
svm_model <- svm(Species ~ ., data=iris)
svm_model
# Parameters:
#   SVM-Type:  C-classification 
# SVM-Kernel:  radial 
# cost:  1 
# 
# Number of Support Vectors:  51
summary(svm_model)

svm_model1 <- svm(x, y)
summary(svm_model1)
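To see the kernel trick at work, the same data can be fit with a linear and a radial kernel and the training accuracies compared (a minimal sketch):

m_lin <- svm(Species ~ ., data = iris, kernel = 'linear')
m_rad <- svm(Species ~ ., data = iris, kernel = 'radial')
mean(predict(m_lin, iris) == iris$Species) # linear kernel accuracy
mean(predict(m_rad, iris) == iris$Species) # radial (RBF) kernel accuracy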

dist(iris[, -5])
cmdscale(dist(iris[, -5])) # lay the distances out as 2-D coordinates (classical MDS)
# [,1]         [,2]
# [1,] -2.684125626  0.319397247
# [2,] -2.714141687 -0.177001225
# [3,] -2.888990569 -0.144949426
# [4,] -2.745342856 -0.318298979
# [5,] -2.728716537  0.326754513
# [6,] -2.280859633  0.741330449
# [7,] -2.820537751 -0.089461385
# [8,] -2.626144973  0.163384960

# support vectors are drawn as +, all other points as o
plot(cmdscale(dist(iris[, -5])), col=as.integer(iris[,5]), pch=c("o","+")[1:150 %in% svm_model$index + 1])

 

pred <- predict(svm_model, x)
head(pred)
t <- table(pred, y)
t
# y
# pred         setosa versicolor virginica
# setosa         50          0         0
# versicolor      0         48         2
# virginica       0          2        48
sum(diag(t)) / nrow(x) # classification accuracy: 0.9733333

# Adjust parameters to get the best model - svm tuning

svm_tune <- tune(svm, train.x = x, train.y = y, kernel='radial',
                 ranges = list(cost=10^(-1:2)), gamma=2^(-2:2))
                # 10-fold cross-validation; note that gamma here is passed straight to svm -
                # to search over gamma as well, it would go inside the ranges list
svm_tune
# - sampling method: 10-fold cross validation 
# 
# - best parameters:
#   cost
# 1
# 
# - best performance: 0.03333333 

svm_model_after_tune <- svm(Species ~ ., data=iris, kernel='radial', cost=1, gamma=0.5)
summary(svm_model_after_tune)
pred <- predict(svm_model_after_tune, x)
t <- table(pred, y)
t
# y
# pred         setosa versicolor virginica
# setosa         50          0         0
# versicolor      0         48         2
# virginica       0          2        48
sum(diag(t)) / nrow(x) # 0.9733333
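Refitting by hand is not strictly necessary: tune() keeps the winning model in its return value (a minimal sketch):

best_model <- svm_tune$best.model # model refit with the best parameters found
summary(best_model)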

# KNN: nearest-neighbor algorithm - classifies a new observation by majority vote
# among its k nearest neighbors, measured by distance (Euclidean distance).
# A non-parametric classification model.

install.packages("ggvis")
library(ggvis)

head(iris, 2)
iris %>% ggvis(~Petal.Length, ~Petal.Width, fill=~factor(Species))
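The ggvis plot only visualizes the class structure; an actual k-NN classification can be run with class::knn, which labels each test row by majority vote among its k nearest training rows (a minimal sketch assuming a 70/30 split):

library(class)
set.seed(123)
idx <- sample(1:nrow(iris), nrow(iris) * 0.7)
pred_knn <- knn(train = iris[idx, 1:4], test = iris[-idx, 1:4],
                cl = iris$Species[idx], k = 3)
t_knn <- table(pred_knn, iris$Species[-idx])
t_knn
sum(diag(t_knn)) / sum(t_knn) # classification accuracy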


 


# Classification model: Naive Bayes classifier - uses conditional probability. P(A|B): the probability that A occurs given that B has occurred

 

 

library(e1071) # Naive Bayes

set.seed(123)
ind <- sample(1:nrow(iris), nrow(iris) * 0.7, replace = FALSE)
train <- iris[ind, ]
test <- iris[-ind, ]

help(naiveBayes)
model <- naiveBayes(Species ~ ., data = train)
model
# naiveBayes.default(x = X, y = Y, laplace = laplace)
# 
# A-priori probabilities:
#   Y
# setosa versicolor  virginica 
# 0.3428571  0.3047619  0.3523810 
# 
# Conditional probabilities:
#   Sepal.Length
# Y                [,1]      [,2]
# setosa     4.966667 0.3741657
# versicolor 5.971875 0.4887340
# virginica  6.586486 0.7165357
# 
# Sepal.Width
# Y                [,1]      [,2]
# setosa     3.394444 0.4049299
# versicolor 2.787500 0.3250310
# virginica  2.948649 0.3500965
# 
# Petal.Length
# Y                [,1]      [,2]
# setosa     1.461111 0.1777282
# versicolor 4.309375 0.4423977
# virginica  5.529730 0.6235494
# 
# Petal.Width
# Y                 [,1]      [,2]
# setosa     0.2555556 0.1157447
# versicolor 1.3500000 0.1951013
# virginica  1.9945946 0.2613490
pred <- predict(model, test, type='class')
head(pred, 3)
t <- table(pred, test$Species)
t
# pred         setosa versicolor virginica
# setosa         14          0         0
# versicolor      0         18         0
# virginica       0          0        13
sum(diag(t)) / nrow(test) # 1
prop.table(t) # proportions over all cells
# pred             setosa versicolor  virginica
# setosa     0.33333333 0.00000000 0.00000000
# versicolor 0.00000000 0.37777778 0.02222222
# virginica  0.00000000 0.00000000 0.26666667
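predict() can also return the class posterior probabilities instead of the winning label (a minimal sketch using type = 'raw'):

post <- predict(model, test, type = 'raw') # one row of posteriors per observation
head(round(post, 4), 3)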


# Random Forest classification model: combines many decision trees into one model (ensemble method)

install.packages("randomForest")
library(randomForest)

set.seed(123)
ind <- sample(1:nrow(iris), nrow(iris) * 0.7, replace = FALSE)
train <- iris[ind, ]
test <- iris[-ind, ]

model <- randomForest(formula=Species ~ ., data=train)
model
# randomForest(formula = Species ~ ., data = train) 
# Type of random forest: classification
# Number of trees: 500
# No. of variables tried at each split: 2
# 
# OOB estimate of  error rate: 5.71%
# Confusion matrix:
#   setosa versicolor virginica class.error
# setosa         36          0         0  0.00000000
# versicolor      0         29         3  0.09375000
# virginica       0          3        34  0.08108108

model2 <- randomForest(formula=Species ~ ., data=train, ntree = 200, mtry = 3, na.action = na.omit)
model2
# randomForest(formula = Species ~ ., data = train, ntree = 200,      mtry = 3, na.action = na.omit) 
# Type of random forest: classification
# Number of trees: 200
# No. of variables tried at each split: 3
# 
# OOB estimate of  error rate: 4.76%
# Confusion matrix:
#   setosa versicolor virginica class.error
# setosa         36          0         0  0.00000000
# versicolor      0         30         2  0.06250000
# virginica       0          3        34  0.08108108

model3 <- randomForest(formula=Species ~ ., data=train, ntree = 1000, mtry = 2, na.action = na.omit)
model3

# Choosing variables and setting ntree / mtry to improve model performance
# Important variables

model4 <- randomForest(Species ~ ., data=train, importance = T)
importance(model4) # Petal.Length and Petal.Width are clearly the most important variables
# setosa versicolor virginica MeanDecreaseAccuracy MeanDecreaseGini
# Sepal.Length  5.947974  9.6040622  7.628999            12.547747         6.743408
# Sepal.Width   4.143228 -0.5277106  7.616918             6.344268         1.895498
# Petal.Length 22.586135 31.0203420 28.842917            34.060288        30.182599
# Petal.Width  21.470279 27.2038165 30.353685            32.467796        30.336028

varImpPlot(model4)

# predict

pred <- predict(model, test)
pred
t <- table(pred, test$Species)
t
# pred         setosa versicolor virginica
# setosa         14          0         0
# versicolor      0         17         0
# virginica       0          1        13

sum(diag(t)) / nrow(test) # 0.9777778

# the same with model4
pred4 <- predict(model4, test)
pred4
t4 <- table(pred4, test$Species)
sum(diag(t4)) / nrow(test) # 0.9777778

# Searching for good ntree and mtry values

ntree <- c(400, 500, 600)
mtry <- c(2:4)
param <- data.frame(n=ntree, m=mtry)

for(i in param$n){
  cat('ntree : ', i, '\n')
  for(j in param$m){
    cat('mtry : ', j, '\n')
    model_ir = randomForest(Species ~ ., data=iris, ntree=i, mtry=j, na.action = na.omit)
    print(model_ir)
  }
}
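The nested loops above do cover all nine combinations, but pairing ntree and mtry in a data.frame is misleading; expand.grid states the grid explicitly (a sketch of the same search):

grid <- expand.grid(ntree = c(400, 500, 600), mtry = 2:4)
for(k in 1:nrow(grid)){
  cat('ntree :', grid$ntree[k], ' mtry :', grid$mtry[k], '\n')
  m <- randomForest(Species ~ ., data = iris, ntree = grid$ntree[k],
                    mtry = grid$mtry[k], na.action = na.omit)
  print(m)
}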

 

'BACK END > R' 카테고리의 다른 글

[R] R 정리 18 - svm, knn  (0) 2021.02.02
[R] R 정리 17 - Naive Bayes  (0) 2021.02.02
[R] R 정리 15 - Decision Tree  (0) 2021.02.02
[R] R 정리 14 - 로지스틱 회귀2 ROC  (0) 2021.02.01
[R] R 정리 13 - 로지스틱 회귀  (0) 2021.02.01

# Classification model: Decision Tree

set.seed(123)
ind <- sample(1:nrow(iris), nrow(iris) * 0.7, replace = FALSE)
train <- iris[ind, ]
test <- iris[-ind, ]

 

# ctree

install.packages("party")
library(party)
iris_ctree <- ctree(formula = Species ~ ., data = train)
iris_ctree
# Conditional inference tree with 4 terminal nodes
# 
# Response:  Species 
# Inputs:  Sepal.Length, Sepal.Width, Petal.Length, Petal.Width 
# Number of observations:  105 
# 
# 1) Petal.Length <= 1.9; criterion = 1, statistic = 97.466
# 2)*  weights = 36 
# 1) Petal.Length > 1.9
# 3) Petal.Width <= 1.7; criterion = 1, statistic = 45.022
# 4) Petal.Length <= 4.6; criterion = 0.987, statistic = 8.721
# 5)*  weights = 25 
# 4) Petal.Length > 4.6
# 6)*  weights = 10 
# 3) Petal.Width > 1.7
# 7)*  weights = 34
plot(iris_ctree, type = "simple")
plot(iris_ctree)

 

# predict

pred <- predict(iris_ctree, test)
pred
t <- table(pred, test$Species) # arguments: predicted values, actual values
t
# pred         setosa versicolor virginica
# setosa         14          0         0
# versicolor      0         18         1
# virginica       0          0        12
sum(diag(t)) / nrow(test) # 0.9777778
library(caret)
confusionMatrix(pred, test$Species)
# Confusion Matrix and Statistics
# 
# Reference
# Prediction   setosa versicolor virginica
# setosa         14          0         0
# versicolor      0         18         1
# virginica       0          0        12
# 
# Overall Statistics
# 
# Accuracy : 0.9778          
# 95% CI : (0.8823, 0.9994)
# No Information Rate : 0.4             
# P-Value [Acc > NIR] : < 2.2e-16       
# 
# Kappa : 0.9662          
# 
# Mcnemar's Test P-Value : NA              
# 
# Statistics by Class:
# 
#                      Class: setosa Class: versicolor Class: virginica
# Sensitivity                 1.0000            1.0000           0.9231
# Specificity                 1.0000            0.9630           1.0000
# Pos Pred Value              1.0000            0.9474           1.0000
# Neg Pred Value              1.0000            1.0000           0.9697
# Prevalence                  0.3111            0.4000           0.2889
# Detection Rate              0.3111            0.4000           0.2667
# Detection Prevalence        0.3111            0.4222           0.2667
# Balanced Accuracy           1.0000            0.9815           0.9615

 

# Method 2: rpart, with pruning

library(rpart)

iris_rpart <- rpart(Species ~ ., data = train, method = 'class')
x11()
plot(iris_rpart)
text(iris_rpart)

plotcp(iris_rpart)
printcp(iris_rpart)

cp <- iris_rpart$cptable[which.min(iris_rpart$cptable[, 'xerror']), 'CP'] # CP with the lowest cross-validated error
iris_rpart_prune <- prune(iris_rpart, cp = cp)
x11()
plot(iris_rpart_prune)
text(iris_rpart_prune)
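Base plot()/text() trees are hard to read; the rpart.plot package (an extra dependency, if installed) draws a richer tree:

# install.packages("rpart.plot")
library(rpart.plot)
rpart.plot(iris_rpart_prune) # class, fitted probability and coverage per node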

