# Category: BACK END/R
# [R] R Notes 16 - Random Forest
# Author: circle kim
# Date: 2021-02-02 11:25
# Random Forest: a classification model that combines many decision-tree
# models into one (an ensemble technique).
# Install only when missing -- an unconditional install.packages() would
# re-download the package on every run of the script.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)

# Reproducible 70/30 train/test split of iris (105 train rows, 45 test rows).
set.seed(123)
ind <- sample(seq_len(nrow(iris)), nrow(iris) * 0.7, replace = FALSE)
train <- iris[ind, ]
test <- iris[-ind, ]
# Fit a random forest with default hyper-parameters
# (500 trees, mtry = floor(sqrt(4)) = 2 predictors tried per split).
model <- randomForest(formula = Species ~ ., data = train)
model
# Printing the fit shows the out-of-bag (OOB) error and confusion matrix:
# randomForest(formula = Species ~ ., data = train)
# Type of random forest: classification
# Number of trees: 500
# No. of variables tried at each split: 2
#
# OOB estimate of error rate: 5.71%
# Confusion matrix:
# setosa versicolor virginica class.error
# setosa 36 0 0 0.00000000
# versicolor 0 29 3 0.09375000
# virginica 0 3 34 0.08108108
# Second fit with explicit hyper-parameters: fewer trees (200), all three
# remaining predictors tried at each split (mtry = 3), NA rows dropped.
model2 <- randomForest(
  formula = Species ~ ., data = train,
  ntree = 200, mtry = 3, na.action = na.omit
)
model2
# randomForest(formula = Species ~ ., data = train, ntree = 200, mtry = 3, na.action = na.omit)
# Type of random forest: classification
# Number of trees: 200
# No. of variables tried at each split: 3
#
# OOB estimate of error rate: 4.76%   (slightly better than the default fit)
# Confusion matrix:
# setosa versicolor virginica class.error
# setosa 36 0 0 0.00000000
# versicolor 0 30 2 0.06250000
# virginica 0 3 34 0.08108108
# Third fit: many trees (1000) with the default mtry of 2.
model3 <- randomForest(
  formula = Species ~ ., data = train,
  ntree = 1000, mtry = 2, na.action = na.omit
)
model3
# Variable selection plus ntree/mtry tuning to improve model performance.
# Important variables: refit with importance = TRUE so permutation-based
# importance (MeanDecreaseAccuracy) is computed alongside Gini importance.
# Always spell out TRUE -- the shorthand T is an ordinary, reassignable name.
model4 <- randomForest(Species ~ ., data = train, importance = TRUE)
importance(model4)  # Petal.Width is the most important variable
# setosa versicolor virginica MeanDecreaseAccuracy MeanDecreaseGini
# Sepal.Length 5.947974 9.6040622 7.628999 12.547747 6.743408
# Sepal.Width 4.143228 -0.5277106 7.616918 6.344268 1.895498
# Petal.Length 22.586135 31.0203420 28.842917 34.060288 30.182599
# Petal.Width 21.470279 27.2038165 30.353685 32.467796 30.336028
varImpPlot(model4)  # dot chart of the two importance measures
# Predict on the held-out test set with the default model.
pred <- predict(model, test)
pred

# Confusion matrix: predicted class (rows) vs. actual class (columns).
# Named `conf` rather than `t` to avoid shadowing base::t().
conf <- table(pred, test$Species)
conf
# pred setosa versicolor virginica
# setosa 14 0 0
# versicolor 0 17 0
# virginica 0 1 13

# Accuracy = correctly classified rows / total test rows.
sum(diag(conf)) / nrow(test)  # 0.9777778
# Same evaluation for the importance-enabled model; the accuracy comes out
# identical on this split.
pred4 <- predict(model4, test)
pred4
conf4 <- table(pred4, test$Species)
sum(diag(conf4)) / nrow(test)  # 0.9777778
# Find good ntree / mtry values by fitting a model for every combination
# and inspecting the printed OOB error.
# NOTE: the original built data.frame(n = ntree, m = mtry) -- a row-wise
# pairing that is misleading; looping over the two vectors directly makes
# the 3 x 3 grid explicit. The inner cat() label also read 'mtree' where
# the value printed is mtry -- fixed here.
ntree <- c(400, 500, 600)
mtry <- 2:4
for (n_trees in ntree) {
  cat("ntree : ", n_trees, "\n")
  for (m_try in mtry) {
    cat("mtry : ", m_try, "\n")
    model_ir <- randomForest(Species ~ ., data = iris,
                             ntree = n_trees, mtry = m_try,
                             na.action = na.omit)
    print(model_ir)
  }
}