首页 > 解决方案 > 在具有条件的变异函数中混合来自 2 个数据帧的值

问题描述

我有 2 个具有这些形式的数据框:

DF1 <- data.frame(
  idCarte = c('a', 'a', 'b', 'b', 'b'),
  idPack = c('1', '2', '2', '3', '3'),
  timeIn = c('10:00:02', '12:00:50', '11:40:00', '12:10:35', '15:15:00'),
  timeOut = c('12:00:00', '14:00:00', '11:50:00', '15:00:00', '16:00:00')
)

DF1
idCarte  idPack      timeIn   timeOut
      a      1     10:00:02  12:00:00
      a      2     12:00:50  14:00:00
      b      2     11:40:00  11:50:00
      b      3     12:10:35  15:00:00
      b      3     15:15:35  16:00:00
DF2 <- data.frame(
  idCarte = c('a', 'a', 'b', 'b', 'b'),
  idPack = c('1', '2', '3', '3', '2'),
  timeBetween = c('11:00:02', '13:00:50', '14:10:35', '15:20:00', '18:00:00')
)

DF2
idCarte  idPack    timeBetween
      a       1       11:00:02
      a       2       13:00:50
      b       3       14:10:35
      b       3       15:20:00
      b       2       18:00:00

我想得到这个结果

idCarte  idPack      timeIn   timeOut  timeBetween
      a      1     10:00:02  12:00:00     11:00:02 
      a      2     12:00:50  14:00:00     13:00:50
      b      2     11:40:00  11:50:00           NA
      b      3     12:10:35  15:00:00     14:10:35
      b      3     15:15:00  16:00:00     15:20:00

我可以用这样的 for 循环来做到这一点,但它真的很慢

for (i in 1:nrow(DF1)) {
  timeBetweenLocal <- DF2 %>%
    filter(
      idCarte == DF1[i,"idCarte"] &
      idPack == DF1[i,"idPack"] &
      timeBetween >= DF1[i,"timeIn"] &
      timeBetween <= DF1[i,"timeOut"]
    )
  if (nrow(timeBetweenLocal) > 0) {
    DF1[i, "timeBetween"] <- timeBetweenLocal[1, "timeBetween"]
  } else {
    DF1[i, "timeBetween"] <- NA
  }
}

我想以矢量化的方式使用 dplyr::mutate 来加快速度,但这似乎有点棘手。

DF1 %>%
  mutate (
    timeBetween = ifelse (
      nrow(DF2 %>%
             dplyr::filter(
               idCarte == .$idCarte &
               idPack == .$idPack &
               timeBetween >=.$timeIn &
               timeBetween <= .$timeOut
             )
      ) > 0,
      DF2 %>%
        dplyr::filter(
          idCarte == .$idCarte &
          idPack == .$idPack &
          timeBetween >=.$timeIn &
          timeBetween <= .$timeOut
        ),
      NA
    )
  )

# Error : Result must have length 4, not 0

我的问题是我需要测试匹配时间,因为有多个 idCarte, idPack 有人有想法矢量化这个算法吗?谢谢

标签: rfor-loopconditional-statementsdplyr

解决方案


这是一个使用left_joinand的解决方案case_whenleft_join可能导致重复的行,你可以使用na.omit或者filter(!duplicated(...))如果你想删除一些重复。

library(lubridate)
library(dplyr)

# Yours data
DF1 <- data.frame(stringsAsFactors = F,
  idCarte = c('a', 'a', 'b', 'b', 'b'),
  idPack = c('1', '2', '2', '3', '3'),
  timeIn = c('10:00:02', '12:00:50', '11:40:00', '12:10:35', '15:15:00'),
  timeOut = c('12:00:00', '14:00:00', '11:50:00', '15:00:00', '16:00:00')
)


DF2 <- data.frame(stringsAsFactors = F,
  idCarte = c('a', 'a', 'b', 'b', 'b'),
  idPack = c('1', '2', '3', '3', '2'),
  timeBetween = c('11:00:02', '13:00:50', '14:10:35', '15:20:00', '18:00:00')
)


# Solution with left_join lead to duplicate rows

df = left_join(x = DF1, y = DF2, by = c("idCarte"="idCarte","idPack"="idPack")) %>% 
  mutate(timeBetween = case_when(hms(timeBetween)>= hms(timeIn) & hms(timeBetween)<= hms(timeOut) ~ timeBetween,
                                  T ~ NA_character_
                                  )
         )

# The output
# 
#   idCarte idPack   timeIn  timeOut timeBetween
# 1       a      1 10:00:02 12:00:00    11:00:02
# 2       a      2 12:00:50 14:00:00    13:00:50
# 3       b      2 11:40:00 11:50:00        <NA>
# 4       b      3 12:10:35 15:00:00    14:10:35
# 5       b      3 12:10:35 15:00:00        <NA>
# 6       b      3 15:15:00 16:00:00        <NA>
# 7       b      3 15:15:00 16:00:00    15:20:00

推荐阅读