首页 > 解决方案 > 如何在 R 中为 xgboost 创建混淆矩阵

问题描述

我已经在 R 中创建了我的 XGBoost 分类器,如下面的代码所示

#importing the dataset
XGBoostDataSet_Hr_Admin_8 <- read.csv("CompletedDataImputed_HR_Admin.csv")

#Use factor function to convert categorical data to numerical data
XGBoostDataSet_Hr_Admin_8$Salary = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Salary, levels =c('L','M', 'H', 'V'), labels =c(1,2,3,4)))
XGBoostDataSet_Hr_Admin_8$Rude_Behavior = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Rude_Behavior, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Feeling_undervalued =as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Feeling_undervalued, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Overall_satisfaction = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Overall_satisfaction, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Raises_frozen = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Raises_frozen, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Poor_Conditions = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Poor_Conditions, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Growth_not_available = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Growth_not_available, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Workplace_Conflict = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Workplace_Conflict, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Employee_Turnover = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Employee_Turnover, levels=c('Y', 'N'), labels =c(1,0)))

#split the data in train dataset and test dataset
library(caTools)
split = sample.split(XGBoostDataSet_Hr_Admin_8$Employee_Turnover,SplitRatio = 0.7)
training_set8 = subset(XGBoostDataSet_Hr_Admin_8, split==TRUE)
test_set8 = subset(XGBoostDataSet_Hr_Admin_8, split==FALSE)

#fitting XGBoost to the Training Test
library(xgboost)
classifier9 = xgboost(data = as.matrix(training_set8[-10]), label = training_set8$Employee_Turnover, nrounds = 10)

现在,我需要为 XGBoost 创建一个混淆矩阵。

我在网上搜索过,不幸的是找不到解决方案。

谁能帮帮我。

提前致谢

标签: rxgboostxgbclassifier

解决方案


您可以使用该caret::confusionMatrix()功能,但您需要对输出进行一些处理。显然,您需要一个真实结果(测试数据集)的向量,以比较计算结果和真实结果:

library(xgboost)


#Use factor function to convert categorical data to numerical data
XGBoostDataSet_Hr_Admin_8$Salary = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Salary, levels =c('L','M', 'H', 'V'), labels =c(1,2,3,4)))
XGBoostDataSet_Hr_Admin_8$Rude_Behavior = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Rude_Behavior, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Feeling_undervalued =as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Feeling_undervalued, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Overall_satisfaction = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Overall_satisfaction, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Raises_frozen = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Raises_frozen, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Poor_Conditions = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Poor_Conditions, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Growth_not_available = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Growth_not_available, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Workplace_Conflict = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Workplace_Conflict, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Employee_Turnover = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Employee_Turnover, levels=c('Y', 'N'), labels =c(1,0)))

# here ifelse 0 1
XGBoostDataSet_Hr_Admin_8$Employee_Turnover = ifelse(XGBoostDataSet_Hr_Admin_8$Employee_Turnover == 1,0,1)

library(caTools)


split = sample.split(XGBoostDataSet_Hr_Admin_8$Employee_Turnover,SplitRatio = 0.7)
training_set8 = subset(XGBoostDataSet_Hr_Admin_8, split==TRUE)
test_set8 = subset(XGBoostDataSet_Hr_Admin_8, split==FALSE)

bst <- xgboost(data = as.matrix(training_set8[,-10]), label = training_set8$Employee_Turnover, max_depth = 2,
               eta = 0.5, nthread = 2, nrounds = 5, objective = "binary:logistic")  

# you've to do your prediction here
pred <- predict(bst, as.matrix(test_set8[,-10]))

# and transform them in a 0 1 variable, you can choose the value to get 1
pred <-  as.numeric(pred > 0.5)

library(caret)
confusionMatrix(factor(pred),factor(test_set8$Employee_Turnover))

Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 67  2
         1  0 16

               Accuracy : 0.9765          
                 95% CI : (0.9176, 0.9971)
    No Information Rate : 0.7882          
    P-Value [Acc > NIR] : 4.626e-07       

                  Kappa : 0.9265          

 Mcnemar's Test P-Value : 0.4795          

            Sensitivity : 1.0000          
            Specificity : 0.8889          
         Pos Pred Value : 0.9710          
         Neg Pred Value : 1.0000          
             Prevalence : 0.7882          
         Detection Rate : 0.7882          
   Detection Prevalence : 0.8118          
      Balanced Accuracy : 0.9444          

       'Positive' Class : 0   

推荐阅读