首页 > 解决方案 > 我可以得到以下嵌套函数的整洁解决方案吗

问题描述

该函数在包含 1000 到 20,000 个案例的 df 上运行良好,但除此之外,它需要数小时(5 小时以上),现在我有一个 57635985 观察的 df

假设一个像这样的df:


d<-structure(list(ReviewType= c("Review","Review","Review","Correction","Correction","Review","Review","Review","Review","Review","Correction","Correction","Deficiency","Correction","Correction", 
                                "Correction", "Deficiency", "Deficiency", "Correction","Correction","Deficiency","Correction"),
                  Submissiondate= c("2020-08-29 04:32:00","2020-08-28 04:31:00","2020-08-26 04:31:00","2020-08-25 04:31:00","2020-08-24 04:31:00","2020-08-23 04:31:00","2020-08-22 04:31:00","2020-08-21 04:31:00","2020-08-20 04:31:00","2020-08-19 04:31:00",
                                    "2020-09-27 04:31:00","2020-09-27 03:52:59","2020-09-28 17:30:00","2020-09-29 14:01:00",
                                    "2020-09-05 03:00:00","2020-09-05 03:51:00", "2020-09-03 23:59:49",
                                    "2020-09-02 00:03:54","2020-09-01 00:04:48","2020-10-01 04:31:00","2020-10-11 04:31:00","2020-10-21 04:31:00"),
                  CaseNo= c("124","123","125","121","121","125","123","123","123","123","123","123","123","125","123","123","123","124","123","127","127","127")), class = "data.frame", row.names = c(NA, -22L))


d<-d%>%arrange(CaseNo,Submissiondate)

下面的代码试图查看每个案例编号,只要状态在每周结束时保持在更正状态,它将在随后的所有周内计入统计信息,直到状态更改为任何内容:

d <- d %>% mutate(Submissiondate = as.Date(Submissiondate), 
                  weekday = wday(Submissiondate), 
                  week.end = Submissiondate + 7 - weekday) 


#1 End state for each case and week
EndStates <- d %>% 
  group_by(CaseNo, week.end) %>% 
  summarize(WeekEndState = last(ReviewType)) %>% ungroup()

#2 Get unique values of Case/Week.end/ReviewType
chk <- d %>%
  select(CaseNo, week.end, ReviewType) %>%
  distinct()

#3 Add the EndStates and count if the week had a correction AND
#   the week ended as a Correction
chk <- left_join(chk, EndStates, by = c("CaseNo", "week.end"))

cor_df <- as.data.frame(matrix(ncol=length(unique(chk$CaseNo))+1, 
                               nrow=length(unique(chk$week.end))))
names(cor_df) <- c("week.end", unique(chk$CaseNo))
cor_df$week.end <- unique(chk$week.end)
for(i in 1:nrow(cor_df)){
      for(j in 2:ncol(cor_df)){
            this_CaseNo_idx = chk$CaseNo == strtoi(names(cor_df)[j])
            idx =  this_CaseNo_idx & 
                  chk$week.end == max(chk$week.end[chk$week.end <= cor_df$week.end[i] &
                                                         this_CaseNo_idx])
            if (sum(idx) < 1){
                  cor_df[i, j] = 0
            }
            else{
                  
                  cor_df[i, j] = max(ifelse(chk$WeekEndState[idx] == "Correction" & 
                                              chk$ReviewType[idx] == "Correction", 
                                            1, 0))
            }
            
      }
}
cor_df$asw <- rowSums(cor_df[,2:ncol(cor_df)])
cor_df <- cor_df[order(cor_df$week.end),]
data.frame(week.end=cor_df$week.end, cor_df$asw)

预期输出:

week.end cor_df.asw
1 2020-08-22          0
2 2020-08-29          1
3 2020-09-05          2
4 2020-10-03          3
5 2020-10-17          2
6 2020-10-24          3

任何指导表示赞赏。

标签: r

解决方案


EndStates我可以从数据框开始带你到那里。我不确定它是否会更快。因为dplyr一次对所有列进行处理(而不是按顺序向下),所以我仍然需要一个while()循环来在缺少的几周内完成一些填充。也许更好的dplyr人会提供替代方案。

library(dplyr)
library(tidyr)
cor_df2 <- EndStates %>%
  mutate(count = as.numeric(WeekEndState == "Correction")) %>% 
  select(-WeekEndState) %>% 
  pivot_wider(id_cols="week.end", names_from="CaseNo", values_from="count") %>% 
  arrange(week.end) %>%
  mutate(across(-week.end, function(x)case_when(is.na(x) & week.end == min(week.end) ~ 0, TRUE ~ x)))

while(any(is.na(cor_df2))){
  cor_df2 <- cor_df2 %>% mutate(across(-week.end, function(x)case_when(is.na(x)~lag(x), TRUE ~ x)))
}  
cor_df2 <- cor_df2 %>%   
  mutate(asw = rowSums(.[-1])) %>% 
  select(week.end, asw)

推荐阅读