r - 如何在 R 中为 xgboost 创建混淆矩阵
问题描述
我已经在 R 中创建了我的 XGBoost 分类器,如下面的代码所示
#importing the dataset
XGBoostDataSet_Hr_Admin_8 <- read.csv("CompletedDataImputed_HR_Admin.csv")
#Use factor function to convert categorical data to numerical data
XGBoostDataSet_Hr_Admin_8$Salary = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Salary, levels =c('L','M', 'H', 'V'), labels =c(1,2,3,4)))
XGBoostDataSet_Hr_Admin_8$Rude_Behavior = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Rude_Behavior, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Feeling_undervalued =as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Feeling_undervalued, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Overall_satisfaction = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Overall_satisfaction, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Raises_frozen = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Raises_frozen, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Poor_Conditions = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Poor_Conditions, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Growth_not_available = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Growth_not_available, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Workplace_Conflict = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Workplace_Conflict, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Employee_Turnover = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Employee_Turnover, levels=c('Y', 'N'), labels =c(1,0)))
#split the data in train dataset and test dataset
library(caTools)
split = sample.split(XGBoostDataSet_Hr_Admin_8$Employee_Turnover,SplitRatio = 0.7)
training_set8 = subset(XGBoostDataSet_Hr_Admin_8, split==TRUE)
test_set8 = subset(XGBoostDataSet_Hr_Admin_8, split==FALSE)
#fitting XGBoost to the Training Test
library(xgboost)
classifier9 = xgboost(data = as.matrix(training_set8[-10]), label = training_set8$Employee_Turnover, nrounds = 10)
现在,我需要为 XGBoost 创建一个混淆矩阵。
我在网上搜索过,不幸的是找不到解决方案。
谁能帮帮我。
提前致谢
解决方案
您可以使用该caret::confusionMatrix()
功能,但您需要对输出进行一些处理。显然,您需要一个真实结果(测试数据集)的向量,以比较计算结果和真实结果:
library(xgboost)
#Use factor function to convert categorical data to numerical data
XGBoostDataSet_Hr_Admin_8$Salary = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Salary, levels =c('L','M', 'H', 'V'), labels =c(1,2,3,4)))
XGBoostDataSet_Hr_Admin_8$Rude_Behavior = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Rude_Behavior, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Feeling_undervalued =as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Feeling_undervalued, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Overall_satisfaction = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Overall_satisfaction, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Raises_frozen = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Raises_frozen, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Poor_Conditions = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Poor_Conditions, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Growth_not_available = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Growth_not_available, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Workplace_Conflict = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Workplace_Conflict, levels=c('Y', 'M', 'N'), labels =c(1,2,3)))
XGBoostDataSet_Hr_Admin_8$Employee_Turnover = as.numeric(factor(XGBoostDataSet_Hr_Admin_8$Employee_Turnover, levels=c('Y', 'N'), labels =c(1,0)))
# here ifelse 0 1
XGBoostDataSet_Hr_Admin_8$Employee_Turnover = ifelse(XGBoostDataSet_Hr_Admin_8$Employee_Turnover == 1,0,1)
library(caTools)
split = sample.split(XGBoostDataSet_Hr_Admin_8$Employee_Turnover,SplitRatio = 0.7)
training_set8 = subset(XGBoostDataSet_Hr_Admin_8, split==TRUE)
test_set8 = subset(XGBoostDataSet_Hr_Admin_8, split==FALSE)
bst <- xgboost(data = as.matrix(training_set8[,-10]), label = training_set8$Employee_Turnover, max_depth = 2,
eta = 0.5, nthread = 2, nrounds = 5, objective = "binary:logistic")
# you've to do your prediction here
pred <- predict(bst, as.matrix(test_set8[,-10]))
# and transform them in a 0 1 variable, you can choose the value to get 1
pred <- as.numeric(pred > 0.5)
library(caret)
confusionMatrix(factor(pred),factor(test_set8$Employee_Turnover))
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 67 2
1 0 16
Accuracy : 0.9765
95% CI : (0.9176, 0.9971)
No Information Rate : 0.7882
P-Value [Acc > NIR] : 4.626e-07
Kappa : 0.9265
Mcnemar's Test P-Value : 0.4795
Sensitivity : 1.0000
Specificity : 0.8889
Pos Pred Value : 0.9710
Neg Pred Value : 1.0000
Prevalence : 0.7882
Detection Rate : 0.7882
Detection Prevalence : 0.8118
Balanced Accuracy : 0.9444
'Positive' Class : 0