r - `$<-.data.frame`(`*tmp*`, Predict, value = c(`1` = 1L, `2` = 1L, : 替换有 3500 行,数据有 1500
问题描述
再会
我已经运行了一个带有调整的随机森林,并将预测添加到运行良好且没有问题的训练数据中。但是,当我尝试在测试数据集上运行随机森林模型时,出现上述错误。关于以下可能导致此问题的任何想法都是我的代码。感谢您对此的任何帮助。训练数据集确实有 3500 行,而测试将有 1500 行,因为数据集由 5000 行组成。
代码:
####Clearing the global environmnent
rm(list = ls())
##Setting the working directory
setwd("D:/Great Learning/Module 3 -Machine Learning/Project")
##Packages required to be loaded
install.packages("DataExplorer")
install.packages("xlsx")
##install.packages("magrittr")
install.packages("dplyr")
install.packages("tidyverse")
install.packages("mice")
install.packages("NbClust")
##Reading in the dataset
library(xlsx)
LoanModelRaw = read.xlsx("Thera Bank_Personal_Loan_Modelling-dataset- 1.xlsx",sheetName = "Bank_Personal_Loan_Modelling",header = T)
##LoanModelRaw = read.csv("Thera Bank_Personal_Loan_Modelling-dataset-1.csv", sep = ";",header = T)
##Viewing the dataset in R
View(LoanModelRaw)
dim(LoanModelRaw)
colnames(LoanModelRaw)
str(LoanModelRaw)
summary(LoanModelRaw)
nrow(LoanModelRaw)
attach(LoanModelRaw)
#Correcting column names
names(LoanModelRaw)[2] = "AgeInYears"
names(LoanModelRaw)[3] = "ExperienceInYears"
names(LoanModelRaw)[4] = "IncomeInKMonth"
names(LoanModelRaw)[5] = "ZIPCode"
names(LoanModelRaw)[6] = "FamilyMembers"
names(LoanModelRaw)[10] = "PersonalLoan"
names(LoanModelRaw)[11] = "SecuritiesAccount"
names(LoanModelRaw)[12] = "CDAccount"
colnames(LoanModelRaw)
#############################################################1 EDA of the data#######################################################
library(DataExplorer)
##introduce(LoanModelRaw)
plot_intro(LoanModelRaw)
plot_missing(LoanModelRaw)
##plot_bar(LoanModelRaw)
plot_histogram(LoanModelRaw)
create_report(LoanModelRaw)
?plot_boxplot
#Missing Value Treatment
library(mice)
sum(is.na(LoanModelRaw))
md.pattern(LoanModelRaw)
LoanModelRawImpute = mice(LoanModelRaw, m =5, method = 'pmm', seed = 1000)
LoanModelRawNoNa = complete(LoanModelRawImpute, 3)
md.pattern(LoanModelRawNoNa)
#Correcting negative experience
LoanModel = abs(LoanModelRawNoNa[2:14])
attach(LoanModel)
#View(LoanModel)
#summary(LoanModel)
#nrow(LoanModel)
#
LoanModel$Split = sample.split(LoanModel$PersonalLoan, SplitRatio = 0.7)
View(LoanModel)
LoanModelTrainRaw = subset(LoanModel,LoanModel$Split == TRUE)
LoanModelTestRaw = subset(LoanModel,LoanModel$Split == FALSE)
#Installing the packages for the running random forest
install.packages("randomForest")
install.packages("dplyr")
library(randomForest)
library(dplyr)
attach(LoanModelTrain)
str(LoanModelTrain)
#Need to exclude the split and move columns
LoanModelTrain = LoanModelTrainRaw[1:13]
LoanModelTest = LoanModelTestRaw[1:13]
LoanModelTrain = LoanModelTrain %>% select(IncomeInKMonth,Mortgage,ZIPCode,CCAvg,everything())
LoanModelTest = LoanModelTest %>% select(IncomeInKMonth,Mortgage,ZIPCode,CCAvg, everything())
head(LoanModelTrain)
head(LoanModelTest)
###Converting the data set to a factor variable in order to be read
#Train
fcol = c(5:13)
LoanModelTrain[,fcol] = lapply(LoanModelTrain[,fcol], factor)
str(LoanModelTrain)
nrow(LoanModelTrain)
#Test
fcol = c(5:13)
LoanModelTest[,fcol] = lapply(LoanModelTest[,fcol], factor)
str(LoanModelTest)
##Running the random forest
seed = 1000
set.seed(seed)
LoanModelTrainRF = randomForest(PersonalLoan ~ ., data = LoanModelTrain, ntree = 501, mtry = 10, nodesize = 10, importance = TRUE, do.trace = TRUE)
print(LoanModelTrainRF)
plot(LoanModelTrainRF)
importance(LoanModelTrainRF)
?randomForest
###Tuning the random Forest
set.seed(seed)
LoanModelTrain = LoanModelTrain %>% select(PersonalLoan,everything())
str(LoanModelTrain)
LoanModelTrainRFTuned = tuneRF(x = LoanModelTrain[,-c(1)],
y = PersonalLoan,
mtryStart = 10,
stepFactor = 1.5,
improve = 0.001,
trace = TRUE,
plot = TRUE,
doBest = TRUE,
importance = TRUE)
###Running refined random forest
LoanModelTrainRefinedRF = randomForest(PersonalLoan ~ ., data = LoanModelTrain, ntree = 95, mtry = 10, nodesize = 10, importance = TRUE, do.trace = TRUE)
print(LoanModelTrainRefinedRF)
plot(LoanModelTrainRefinedRF)
###Adding the prediction columns and probability columns
LoanModelTrain$Predict = predict(LoanModelTrainRefinedRF,data= LoanModelTrain, type = "class")
LoanModelTrain$Score = predict(LoanModelTrainRefinedRF,data= LoanModelTrain, type = "prob")
head(LoanModelTrain)
###Check the accuracy of the model
install.packages("caret")
library(caret)
caret::confusionMatrix(LoanModelTrain$PersonalLoan, LoanModelTrain$Predict)
###Run the model against the Test Data
str(LoanModelTest)
** LoanModelTest$Predict = predict(LoanModelTrainRefinedRF,data= LoanModelTest, type = "class") ** LoanModelTest$Score = predict(LoanModelTrainRefinedRF,data= LoanModelTest, type = "prob")
AgeInYears ExperienceInYears IncomeInKMonth ZIPCode FamilyMembers CCAvg Education
25 1 49 91107 4 1.6 1
45 19 34 90089 3 1.5 1
39 15 11 94720 1 1.0 1
35 9 100 94112 1 2.7 2
35 8 45 91330 4 1.0 2
37 13 29 92121 4 0.4 2
Mortgage PersonalLoan SecuritiesAccount CDAccount Online CreditCard Split
0 0 1 0 0 0 FALSE
0 0 1 0 0 0 FALSE
0 0 0 0 0 0 TRUE
0 0 0 0 0 0 TRUE
0 0 0 0 0 1 TRUE
155 0 0 0 1 0 TRUE
解决方案
尝试从简单的 glm 模型预测单个结果时,我遇到了同样的错误。在模型中,我使用“dataset$outcome”等格式指定了结果和预测变量。在“测试”集中(实际上只有一行观察,我将列命名为“结果”等。如果我从模型,而是指定“数据=数据集”,然后错误消失。所以这可能是如何调用对象的问题。
推荐阅读
- xml - Postgre SQL 数据转 XML 格式
- ios - 如何从 XCAsset 加载特定文件夹?
- javascript - 如何在反应中将全局 onerror 处理程序弹出到标签中,以捕获断开的链接?
- python - 使用python操作和替换第一个字符串并维护第二个字符串行
- python - 我们如何巧妙地将数据从文件夹转移到数据集
- python - Python脚本不适用于django shell
- javascript - 复选框依赖项 javascript
- c++ - 为什么派生类成员的初始化值在转换为基指针时不会丢失?
- haskell - 什么是协变函子?
- mongoose - mongoose find() 函数不返回整个对象,跳过空字段