首页 > 解决方案 > 识别数据集中的预留扩展(ddply?)

问题描述

我有一个预订数据集,其中一些是原始预订的扩展。我正在尝试识别这些扩展名。

我一直在尝试在 ddply 或“for(数据中的行)”样式循环中执行此操作,但无论哪种情况,我都无法弄清楚如何使组合起作用。

这 3 项检查必须针对每隔一行组合进行。即第 1 行,检查第 2 行的入住/退房日期、建筑代码、访客电子邮件等。

for (i in datax) {
  #1:length(fdr.list)
datax %>% filter(datax$email == datax[i,]$email)

# filter <- datax[datax$email == datax[i]$email]

datax[i, ]$Extension <- ifelse(data[i, ]$StaysOrdered == 1, 0, #Initial filtering just to do less work
     ifelse(
        floor_date(datax[i, ]$checkOutDate)== floor_date(datax$checkInDate) &
        datax[i, ]$buildingcode == datax$buildingcode &
          ifelse(not(is.na(datax[i, ]$email)),ifelse(datax[i, ]$email == datax$email
        , 1, 0),0)))
}

我没有可重现的示例,因为我希望中间的逻辑会改变和/或扩展,因此我更多的是寻找可以扩展的代码库,而不是直接解决问题。

这涉及约 30,000 个预订,因此理想情况下,代码不会花费太长时间来排行。我不知道这将如何工作,但也许先过滤然后通过guestemail检查?

标签: r

解决方案


最终创建了一些独特的代码字符串并匹配它们。将它从一个小时减少到约 5 分钟的时间上的巨大差异是将结果放入预先填充的矩阵中,而不是直接放入 data.frame 中。

#Create unique codes to match; by three methods (email, lastname_cleaned, guest), one for check-out one for check-in
data$Extension_MatchCode_Email_Out <- paste(data$internaltitle, "-", floor_date(data$checkOutDate,"day"), "-", data$email)
data$Extension_MatchCode_Email_In <- paste(data$internaltitle, "-", floor_date(data$checkInDate,"day"), "-", data$email)
data$Extension_MatchCode_Name_Out <- paste(data$internaltitle, "-", floor_date(data$checkOutDate,"day"), "-", data$lastname_cleaned)
data$Extension_MatchCode_Name_In <- paste(data$internaltitle, "-", floor_date(data$checkInDate,"day"), "-", data$lastname_cleaned)
data$Extension_MatchCode_Guest_Out <- paste(data$internaltitle, "-", floor_date(data$checkOutDate,"day"), "-", data$guest)
data$Extension_MatchCode_Guest_In <- paste(data$internaltitle, "-", floor_date(data$checkInDate,"day"), "-", data$guest)

#Prepopulate results matices. We don't want to be writing into the dataframe directly else it will be ~8x slower
matchval_email <- rep(NA, nrow(data))
matchval_lastname <- rep(NA, nrow(data))
matchval_guest <- rep(NA, nrow(data))

#For loop; for each row check if there is a match in another row's 'in' column to the row's 'out' column. Every 100 loops, print progress and time stamp.
for (i in 1:nrow(data)) {
ifelse(i %% 100 == 0, print(paste(i, "-", Sys.time())),"")

#Match by customer email method
matchval_email[i] <- ifelse(is.na(data[i, ]$email), 0, #Check if email is blank, if so, skip
                              ifelse(data[i, ]$Stays_Email <2, 0, #Check if stays for this email is <2, if so, skip
                              fmatch(data[i, ]$Extension_MatchCode_Email_In, data$Extension_MatchCode_Email_Out, nomatch = 0))) #find first occurance of a match for out to in code.
#Match by last name method
matchval_lastname[i] <- ifelse(is.na(data[i, ]$lastname_cleaned), 0,
                              ifelse(data[i, ]$Stays_LastName_Cleaned <2, 0,
                              fmatch(data[i, ]$Extension_MatchCode_Name_In, data$Extension_MatchCode_Name_Out, nomatch = 0)))
#Match by guest code method
matchval_guest[i] <- ifelse(is.na(data[i, ]$guest), 0,
                              ifelse(data[i, ]$Stays <2, 0,                               
                              fmatch(data[i, ]$Extension_MatchCode_Guest_In, data$Extension_MatchCode_Guest_Out, nomatch = 0)))

}


#Move matrix results into dataframe once
data$matchval_email <- matchval_email
data$matchval_lastname <- matchval_lastname
data$matchval_guest <- matchval_guest

推荐阅读