BACK END/R

[R] R 정리 16 - Random Forest

circle kim 2021. 2. 2. 11:25

# Random Forest classification: an ensemble method that combines many
# decision-tree models (bagging over bootstrap samples).

# Install the package only when it is missing, then load it.
# (Unconditionally calling install.packages() on every run hits the network
# and CRAN each time the script is sourced.)
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)

# Reproducible 70/30 train/test split of iris (105 train rows, 45 test rows).
set.seed(123)
ind <- sample(seq_len(nrow(iris)), nrow(iris) * 0.7, replace = FALSE)
train <- iris[ind, ]
test <- iris[-ind, ]

# Baseline forest with package defaults (ntree = 500, mtry = floor(sqrt(p)) = 2).
model <- randomForest(formula = Species ~ ., data = train)
model
# Console output:
# randomForest(formula = Species ~ ., data = train)
# Type of random forest: classification
# Number of trees: 500
# No. of variables tried at each split: 2
#
# OOB estimate of  error rate: 5.71%
# Confusion matrix:
#   setosa versicolor virginica class.error
# setosa         36          0         0  0.00000000
# versicolor      0         29         3  0.09375000
# virginica       0          3        34  0.08108108

# Tuned forest: fewer trees (200), all 4 predictors per split, drop NA rows.
model2 <- randomForest(
  formula = Species ~ ., data = train,
  ntree = 200, mtry = 3, na.action = na.omit
)
model2
# Console output:
# randomForest(formula = Species ~ ., data = train, ntree = 200,      mtry = 3, na.action = na.omit)
# Type of random forest: classification
# Number of trees: 200
# No. of variables tried at each split: 3
#
# OOB estimate of  error rate: 4.76%
# Confusion matrix:
#   setosa versicolor virginica class.error
# setosa         36          0         0  0.00000000
# versicolor      0         30         2  0.06250000
# virginica       0          3        34  0.08108108

# A third variant for comparison: more trees (1000), back to mtry = 2.
model3 <- randomForest(
  formula = Species ~ ., data = train,
  ntree = 1000, mtry = 2, na.action = na.omit
)
model3

# Variable selection for better models: inspect per-variable importance,
# then choose ntree / mtry accordingly.

# importance = TRUE enables the permutation-based importance measures.
model4 <- randomForest(Species ~ ., data = train, importance = TRUE)
importance(model4)  # Petal.Width is the most important variable
# setosa versicolor virginica MeanDecreaseAccuracy MeanDecreaseGini
# Sepal.Length  5.947974  9.6040622  7.628999            12.547747         6.743408
# Sepal.Width   4.143228 -0.5277106  7.616918             6.344268         1.895498
# Petal.Length 22.586135 31.0203420 28.842917            34.060288        30.182599
# Petal.Width  21.470279 27.2038165 30.353685            32.467796        30.336028

# Dot chart of the two importance measures per predictor.
varImpPlot(model4)

# Evaluate the baseline model on the held-out test set.

pred <- predict(model, test)
pred
# Confusion matrix (renamed from `t` to avoid masking base::t()).
conf_mat <- table(pred, test$Species)
conf_mat
# pred         setosa versicolor virginica
# setosa         14          0         0
# versicolor      0         17         0
# virginica       0          1        13

# Accuracy = correctly classified / total test rows.
sum(diag(conf_mat)) / nrow(test) # 0.9777778

# Same evaluation for the importance-enabled model4.
pred4 <- predict(model4, test)
pred4
conf_mat4 <- table(pred4, test$Species)
sum(diag(conf_mat4)) / nrow(test) # 0.9777778

# Find good ntree / mtry values by fitting every combination and comparing
# the printed OOB error rates.
#
# The original built data.frame(n = ntree, m = mtry) only to loop over its
# two columns separately — the plain vectors already give the full 3 x 3 grid.
# NOTE(review): the grid fits use data = iris (the full set), not train,
# unlike every model above — confirm that is intentional.

ntree_grid <- c(400, 500, 600)
mtry_grid <- 2:4

for (i in ntree_grid) {
  cat('ntree : ', i, '\n')
  for (j in mtry_grid) {
    cat('mtry : ', j, '\n')  # fixed label: was misspelled 'mtree'
    model_ir <- randomForest(Species ~ ., data = iris, ntree = i, mtry = j,
                             na.action = na.omit)
    print(model_ir)  # shows OOB error + confusion matrix per combination
  }
}