BACK END/R

[R] R 정리 23 - 비계층적 군집분석2 (k-means)

circle kim 2021. 2. 5. 11:23

23. 계층적 분집분석 : k-means

 

 - data load

data <- read.csv("testdata/exam.csv", sep = " ")
data
#     bun kor mat eng sci
# 1    1  98  95  95  90
# 2    2  65  90  60  88
# 3    3  85  53  48  50
# 4    4  65  92  62  90
# 5    5  68  72  88  73
# 6    6  90  92  90  96
# 7    7  65  70  76  80
# 8    8  60  91  62  90
# 9    9  65  70  86  76
# 10  10 100  98  97 100

 

 -  거리계산

d_data <- dist(data, method = "euclidean")
d_data
#            1         2         3         4         5         6         7         8         9
# 2  48.414874                                                                                
# 3  75.802375 57.948253                                                                      
# 4  46.861498  4.000000 60.975405                                                            
# 5  42.225585 36.755952 52.754147 37.080992                                                  
# 6  12.609520 40.112342 73.722452 38.065733 37.656341                                        
# 7  47.021272 27.294688 48.877398 28.089144 14.491377 39.522146                              
# 8  50.970580  8.366600 62.369865  6.480741 37.403208 41.533119 27.622455                    
# 9  45.332108 35.623026 53.338541 35.791060  6.480741 39.166312 10.954451 35.199432          
# 10 14.071247 53.535035 84.852814 51.205468 50.348784 14.730920 53.469618 54.571055 52.028838

 

 - 다차원 척도법

gra_data <- cmdscale(d_data)
gra_data
plot(gra_data, type = "n") # 그래프 분석 시 4개의 군집 확인 가능
text(gra_data, as.character(1:10)) # 그래프에 text 출력

data$avg <- apply(data[, 2:5], 1, mean) # 참고용 avg - data에 행으로 mean() 함수 수행
data
#    bun kor mat eng sci   avg
# 1    1  98  95  95  90 94.50
# 2    2  65  90  60  88 75.75
# 3    3  85  53  48  50 59.00
# 4    4  65  92  62  90 77.25
# 5    5  68  72  88  73 75.25
# 6    6  90  92  90  96 92.00
# 7    7  65  70  76  80 72.75
# 8    8  60  91  62  90 75.75
# 9    9  65  70  86  76 74.25
# 10  10 100  98  97 100 98.75

 

 - k-means

library(NbClust)
data_s <- scale(data[2:5])
head(data_s)
#             kor        mat        eng        sci
# [1,]  1.4199308  0.8523469  1.0794026  0.4640535
# [2,] -0.7196910  0.5167772 -0.9517313  0.3255301
# [3,]  0.5770495 -1.9664380 -1.6481201 -2.3064152
# [4,] -0.7196910  0.6510051 -0.8356665  0.4640535
# [5,] -0.5251799 -0.6912734  0.6731758 -0.7133957
# [6,]  0.9012346  0.6510051  0.7892406  0.8796238

 

 

 -  Best 군집 수 획득

nc <- NbClust::NbClust(data_s, min.nc = 2, max.nc = 5, method = "kmeans")
nc # 4 2 3 2 1 4 1 2 1 4

 

plot(table(nc$Best.nc[1,]))

 

 - model

model_kmeans <- kmeans(data[, c("bun", "avg")], 4)
model_kmeans

table(model_kmeans$cluster)
# 1 2 3 4 
# 3 1 3 3 

cluster <- model_kmeans$cluster
cluster # 2 1 4 1 1 2 3 3 3 2
kmeans_df <- cbind(cluster, data[, c("bun", "avg")])
kmeans_df
#    cluster bun   avg
# 1        4   1 94.50
# 2        1   2 75.75
# 3        3   3 59.00
# 4        1   4 77.25
# 5        1   5 75.25
# 6        4   6 92.00
# 7        2   7 72.75
# 8        2   8 75.75
# 9        2   9 74.25
# 10       4  10 98.75

str(kmeans_df)
kmeans_df <- transform(kmeans_df, cluster = as.factor(cluster)) # 범주형으로 변경
str(kmeans_df) 

 

 - clustering

library(ggplot2)
df_plot <- ggplot(data = kmeans_df, aes(x=bun, y=avg, col = cluster)) +
            geom_point(aes(shape=factor(cluster), size = 4)) +
            ggtitle("군집분석연습")
df_plot


 - 계층적 군집분석 : k -means

data("diamonds")
diamonds
# carat cut       color clarity depth table price     x     y     z
# <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
#   1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
# 2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
# 3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
# 4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
# 5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
# 6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
# 7 0.24  Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
# 8 0.26  Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
# 9 0.22  Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
# 10 0.23  Very Good H     VS1      59.4    61   338  4     4.05  2.39
typeof(diamonds) # list

length(diamonds) # 10
NROW(diamonds) # 53940
t <- sample(1:nrow(diamonds), 1000)
test <- diamonds[t, ]
dim(test) # 1000   10
str(test)

mydia <- test[c("price", "carat","depth","table")]
head(mydia)

 

1. 계층적 군집분석(탐색적)

result1 <- hclust(dist(mydia), method = "ave")
result1
plot(result1, hang = -1)

2. 비계층적 군집분석(확인적)

result2 <- kmeans(mydia, 4) # 군집 수 4
result2
names(result2)

result2$cluster

mydia$cluster <- result2$cluster
head(mydia)
#     price carat depth table cluster
# <int> <dbl> <dbl> <dbl>   <int>
#   1 11322  1.7   60.1    58       1
# 2  1009  0.5   63.8    57       3
# 3  1022  0.43  62.4    60       3
# 4  4778  1.2   58.5    61       4
# 5  4111  0.81  61.6    58       4
# 6   675  0.3   61.3    60       3

cor(mydia[, -5], method = 'pearson') # 상관계수
#             price      carat       depth      table
# price  1.00000000 0.90263800 -0.01329869  0.1134894
# carat  0.90263800 1.00000000  0.02856267  0.1670910
# depth -0.01329869 0.02856267  1.00000000 -0.3562396
# table  0.11348941 0.16709103 -0.35623960  1.0000000

plot(mydia[, -5])

 

plot(mydia[c("carat", "price")], col=mydia$cluster) # price과 carat열로 군집 결과 시각화

# 중심점 표시
points(result2$centers[, c("carat", "price")], col = c(1,2,3,4), pch=8, cex=5)