首页 > 解决方案 > `$<-.data.frame`(`*tmp*`, Predict, value = c(`1` = 1L, `2` = 1L, : 替换有 3500 行,数据有 1500

问题描述

再会

我已经运行了一个带有调整的随机森林,并将预测添加到运行良好且没有问题的训练数据中。但是,当我尝试在测试数据集上运行随机森林模型时,出现上述错误。关于以下可能导致此问题的任何想法都是我的代码。感谢您对此的任何帮助。训练数据集确实有 3500 行,而测试将有 1500 行,因为数据集由 5000 行组成。

代码:

####Clearing the global environmnent
rm(list = ls())

##Setting the working directory
setwd("D:/Great Learning/Module 3 -Machine Learning/Project")


##Packages required to be loaded
install.packages("DataExplorer")
install.packages("xlsx")
##install.packages("magrittr")
install.packages("dplyr")
install.packages("tidyverse")
install.packages("mice")
install.packages("NbClust")

##Reading in the dataset
library(xlsx)
LoanModelRaw = read.xlsx("Thera Bank_Personal_Loan_Modelling-dataset- 1.xlsx",sheetName = "Bank_Personal_Loan_Modelling",header = T)
##LoanModelRaw = read.csv("Thera Bank_Personal_Loan_Modelling-dataset-1.csv", sep = ";",header = T)

##Viewing the dataset in R
View(LoanModelRaw)
dim(LoanModelRaw)
colnames(LoanModelRaw)
str(LoanModelRaw)
summary(LoanModelRaw)
nrow(LoanModelRaw)
attach(LoanModelRaw)

#Correcting column names
names(LoanModelRaw)[2] = "AgeInYears" 
names(LoanModelRaw)[3] = "ExperienceInYears"
names(LoanModelRaw)[4] = "IncomeInKMonth"
names(LoanModelRaw)[5] = "ZIPCode"
names(LoanModelRaw)[6] = "FamilyMembers"
names(LoanModelRaw)[10] = "PersonalLoan"
names(LoanModelRaw)[11] = "SecuritiesAccount"
names(LoanModelRaw)[12] = "CDAccount" 

colnames(LoanModelRaw)

#############################################################1 EDA of the data#######################################################

library(DataExplorer)
##introduce(LoanModelRaw)
plot_intro(LoanModelRaw)
plot_missing(LoanModelRaw)
##plot_bar(LoanModelRaw)
plot_histogram(LoanModelRaw)
create_report(LoanModelRaw)

?plot_boxplot

#Missing Value Treatment
library(mice)
sum(is.na(LoanModelRaw))
md.pattern(LoanModelRaw)
LoanModelRawImpute = mice(LoanModelRaw, m =5, method = 'pmm', seed = 1000)
LoanModelRawNoNa = complete(LoanModelRawImpute, 3)
md.pattern(LoanModelRawNoNa)

#Correcting negative experience
LoanModel = abs(LoanModelRawNoNa[2:14])
attach(LoanModel)
#View(LoanModel)
#summary(LoanModel)
#nrow(LoanModel)
#
LoanModel$Split = sample.split(LoanModel$PersonalLoan, SplitRatio = 0.7)
View(LoanModel)
LoanModelTrainRaw = subset(LoanModel,LoanModel$Split == TRUE)
LoanModelTestRaw = subset(LoanModel,LoanModel$Split == FALSE)

#Installing the packages for the running random forest
install.packages("randomForest")
install.packages("dplyr")
library(randomForest)
library(dplyr)
attach(LoanModelTrain)
str(LoanModelTrain)

#Need to exclude the split and move columns
LoanModelTrain = LoanModelTrainRaw[1:13]
LoanModelTest = LoanModelTestRaw[1:13]
LoanModelTrain = LoanModelTrain %>% select(IncomeInKMonth,Mortgage,ZIPCode,CCAvg,everything())
LoanModelTest = LoanModelTest %>% select(IncomeInKMonth,Mortgage,ZIPCode,CCAvg, everything())
head(LoanModelTrain)
head(LoanModelTest)

###Converting the data set to a factor variable in order to be read
#Train
fcol = c(5:13)
LoanModelTrain[,fcol] = lapply(LoanModelTrain[,fcol], factor)
str(LoanModelTrain)
nrow(LoanModelTrain)

#Test
fcol = c(5:13)
LoanModelTest[,fcol] = lapply(LoanModelTest[,fcol], factor)
str(LoanModelTest)

##Running the random forest
seed = 1000
set.seed(seed)
LoanModelTrainRF = randomForest(PersonalLoan ~ ., data = LoanModelTrain, ntree = 501, mtry = 10, nodesize = 10, importance = TRUE, do.trace = TRUE)
print(LoanModelTrainRF)
plot(LoanModelTrainRF)
importance(LoanModelTrainRF)
?randomForest

###Tuning the random Forest
set.seed(seed)
LoanModelTrain = LoanModelTrain %>% select(PersonalLoan,everything())
str(LoanModelTrain)
LoanModelTrainRFTuned = tuneRF(x = LoanModelTrain[,-c(1)], 
                               y = PersonalLoan,
                               mtryStart = 10,
                               stepFactor = 1.5,
                               improve = 0.001,
                               trace = TRUE,
                               plot = TRUE,
                               doBest = TRUE,
                               importance = TRUE)

###Running refined random forest
LoanModelTrainRefinedRF = randomForest(PersonalLoan ~ ., data = LoanModelTrain, ntree = 95, mtry = 10, nodesize = 10, importance = TRUE, do.trace = TRUE)
print(LoanModelTrainRefinedRF)
plot(LoanModelTrainRefinedRF)


###Adding the prediction columns and probability columns 
LoanModelTrain$Predict = predict(LoanModelTrainRefinedRF,data= LoanModelTrain, type = "class")
LoanModelTrain$Score = predict(LoanModelTrainRefinedRF,data= LoanModelTrain, type = "prob")
head(LoanModelTrain)

###Check the accuracy of the model
install.packages("caret")
library(caret)

caret::confusionMatrix(LoanModelTrain$PersonalLoan, LoanModelTrain$Predict)


###Run the model against the Test Data
str(LoanModelTest)

** LoanModelTest$Predict = predict(LoanModelTrainRefinedRF,data= LoanModelTest, type = "class") ** LoanModelTest$Score = predict(LoanModelTrainRefinedRF,data= LoanModelTest, type = "prob")

AgeInYears  ExperienceInYears   IncomeInKMonth  ZIPCode FamilyMembers   CCAvg   Education
25  1   49  91107   4   1.6 1
45  19  34  90089   3   1.5 1
39  15  11  94720   1   1.0 1
35  9   100 94112   1   2.7 2
35  8   45  91330   4   1.0 2
37  13  29  92121   4   0.4 2

Mortgage    PersonalLoan    SecuritiesAccount   CDAccount   Online  CreditCard  Split
0   0   1   0   0   0   FALSE
0   0   1   0   0   0   FALSE
0   0   0   0   0   0   TRUE
0   0   0   0   0   0   TRUE
0   0   0   0   0   1   TRUE
155 0   0   0   1   0   TRUE

标签: rrandom-forestsupervised-learning

解决方案


尝试从简单的 glm 模型预测单个结果时,我遇到了同样的错误。在模型中,我使用“dataset$outcome”等格式指定了结果和预测变量。在“测试”集中(实际上只有一行观察,我将列命名为“结果”等。如果我从模型,而是指定“数据=数据集”,然后错误消失。所以这可能是如何调用对象的问题。


推荐阅读