首页 > 解决方案 > 在 R 中,根据另一个数据框的分组变量和日期,将一个数据框中的 NA 替换为对应的因子值

问题描述

我正在使用两个不同的数据框,它们的结构不同但包含相同的信息。这是第一个模拟数据框:-

# First mock data frame: 90 events, one per row, spread over six users.

# The six user ids take turns in a fixed order, 15 rounds in total.
UserId <- rep(
  c("n4RhHC", "k6qjRM", "nPUYOw", "Bm77mQ", "i95C1a", "iX45wN"),
  times = 15L
)

# Event timestamps as (fractional) seconds since the Unix epoch.
# The time zone is pinned to UTC so the printed values are the same on every
# machine; an empty tzone would render them in the session's local zone.
eventDateTime <- structure(
  c(
    1420081869.97425, 1420172875.75246, 1420267947.2653,
    1420368834.57653, 1420424712.65942, 1420541210.43439, 1420629609.9716,
    1420703746.46464, 1420788777.7267, 1420850669.16688, 1420943298.10164,
    1421028427.25171, 1421136879.38698, 1421210193.28063, 1421313257.14934,
    1421387900.60726, 1421483801.11956, 1421582050.3433, 1421642017.51975,
    1421745585.63356, 1421838779.26598, 1421893964.55692, 1421999352.30669,
    1422063023.98015, 1422155543.93289, 1422247080.1288, 1422317378.46239,
    1422419719.15975, 1422527170.64454, 1422590703.07666, 1422683225.86099,
    1422774701.24366, 1422856520.98446, 1422929644.60038, 1423043742.52736,
    1423123277.76309, 1423215111.16198, 1423271863.16464, 1423384864.31287,
    1423457767.05536, 1423561864.87991, 1423640753.00037, 1423733022.69533,
    1423809491.16866, 1423894883.88586, 1423992500.18921, 1424045807.90794,
    1424151816.33881, 1424249235.95351, 1424333926.00324, 1424411033.16768,
    1424514004.2494, 1424582125.79503, 1424660175.24237, 1424739053.33484,
    1424826696.93812, 1424922462.93775, 1425017605.00017, 1425110198.6193,
    1425185575.06409, 1425293836.23993, 1425353483.6657, 1425447031.63937,
    1425527959.44992, 1425628117.60418, 1425697546.32493, 1425793473.15473,
    1425892304.62097, 1425949239.4667, 1426069813.88146, 1426133047.95091,
    1426241063.82313, 1426306176.72673, 1426392019.07701, 1426484578.37379,
    1426588942.96811, 1426674139.46513, 1426740047.54828, 1426843180.25419,
    1426937498.69748, 1427001177.28974, 1427099580.63412, 1427172479.75674,
    1427255655.21296, 1427360706.16479, 1427423156.30542, 1427531520.43681,
    1427592457.09099, 1427684205.1038, 1427766190.74919
  ),
  class = c("POSIXct", "POSIXt"),
  tzone = "UTC"
)

# Outcome of each event, stored as integer codes into the four levels below.
# The extra "variable" class and the "varname" attribute are carried over
# unchanged from the data this example was dumped from.
eventOutcome <- structure(
  c(
    1L, 3L, 2L, 4L, 3L, 1L, 1L, 4L, 2L, 1L, 4L, 4L, 4L,
    1L, 1L, 1L, 3L, 2L, 1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L, 4L, 3L, 4L,
    2L, 2L, 4L, 3L, 3L, 4L, 3L, 1L, 2L, 2L, 1L, 3L, 3L, 2L, 3L, 2L,
    1L, 4L, 2L, 4L, 1L, 4L, 3L, 3L, 4L, 1L, 4L, 2L, 4L, 1L, 3L, 2L,
    3L, 3L, 1L, 1L, 2L, 2L, 2L, 4L, 4L, 1L, 1L, 1L, 2L, 1L, 3L, 2L,
    3L, 3L, 2L, 3L, 3L, 4L, 1L, 1L, 4L, 2L, 4L, 2L, 1L
  ),
  levels = c("Outcome_A", "Outcome_B", "Outcome_C", "Outcome_D"),
  class = c("variable", "factor"),
  varname = "Factor"
)

df <- data.frame(UserId, eventDateTime, eventOutcome)

# Sort by user, then chronologically within each user. Base order() keeps this
# snippet runnable without attaching any package; method = "radix" compares
# strings in C-locale order, so the result does not depend on the session
# locale.
df <- df[order(df$UserId, df$eventDateTime, method = "radix"), ]
rownames(df) <- NULL # renumber rows 1..n after the reorder

head(df)
#  UserId       eventDateTime eventOutcome
#1 Bm77mQ 2015-01-04 10:53:54    Outcome_D
#2 Bm77mQ 2015-01-10 00:44:29    Outcome_A
#3 Bm77mQ 2015-01-16 05:58:20    Outcome_A
#4 Bm77mQ 2015-01-22 02:32:44    Outcome_C
#5 Bm77mQ 2015-01-28 04:35:19    Outcome_C
#6 Bm77mQ 2015-02-03 02:14:04    Outcome_C

这是另一个数据框:-

# Second mock data frame: three consecutive time windows for each of three
# users, with an outcome that is missing (NA) for some windows.
UserId <- rep(c("n4RhHC", "nPUYOw", "iX45wN"), each = 3L)

# Window start times, kept as text until the frame has been assembled.
Firstime <- c(
  "2015-01-01 03:11:09", "2015-01-19 04:33:37", "2015-02-18 08:47:15",
  "2015-01-21 11:12:59", "2015-02-08 08:41:04", "2015-03-04 05:30:31",
  "2015-01-12 02:07:07", "2015-02-05 08:01:17", "2015-03-01 04:52:55"
)

# Window end times, in the same text format as the start times.
Lasttime <- c(
  "2015-01-13 08:14:39", "2015-02-06 09:31:51", "2015-02-24 00:50:53",
  "2015-02-02 05:55:20", "2015-02-26 03:47:42", "2015-03-16 05:42:58",
  "2015-01-24 01:30:23", "2015-02-17 05:43:36", "2015-03-13 10:04:23"
)

# One outcome per window; NA marks the entries that still need filling in.
eventOutcome <- c(
  "Outcome_D", NA, "Outcome_A",
  NA, NA, "Outcome_A",
  "Outcome_B", "Outcome_B", NA
)

df2 <- data.frame(UserId, Firstime, Lasttime, eventOutcome)

# Parse both boundary columns from text into date-times in a single step.
df2[c("Firstime", "Lasttime")] <- lapply(
  df2[c("Firstime", "Lasttime")],
  lubridate::ymd_hms
)

head(df2)
#  UserId            Firstime            Lasttime eventOutcome
#1 n4RhHC 2015-01-01 03:11:09 2015-01-13 08:14:39    Outcome_D
#2 n4RhHC 2015-01-19 04:33:37 2015-02-06 09:31:51         <NA>
#3 n4RhHC 2015-02-18 08:47:15 2015-02-24 00:50:53    Outcome_A
#4 nPUYOw 2015-01-21 11:12:59 2015-02-02 05:55:20         <NA>
#5 nPUYOw 2015-02-08 08:41:04 2015-02-26 03:47:42         <NA>
#6 nPUYOw 2015-03-04 05:30:31 2015-03-16 05:42:58    Outcome_A

如您所见,df2 中有 NA 条目,我想用 df 中对应的 eventOutcome 来替换它们。如何根据 UserId 以及 Lasttime 列中的时间条目(该时间同时也与 eventDateTime 中的某个时间条目相匹配),用 df 中相应的 eventOutcome 填充 df2 中的 NA 字段?

例如,df2 中的第一个 NA 字段将替换为“Outcome_A”,第二个 NA 字段将替换为“Outcome_C”等等。我希望这是有道理的。

标签: r dplyr timestamp match na

解决方案


推荐阅读