首页 > 解决方案 > 过滤器和左连接

问题描述

我有以下数据集

#Data to be matched
Data <- data.frame(
  Register = c(141 ,565, 1411,501),
  Name = c("Steve","Rbin", "Hero", "Cloud"),
  Date = c("43103", "43709", "43948", "43988"))

#Mapping table
Map = data.frame(
  Register = c(141,141,141,565,565,565,1411,1411,1411,500,500,500),
  Name = c("Steve","Steve","Steve","Robin","Robin","Robin","Hero","Hero","Hero","Cloud","Cloud","Cloud"),
  Class = c("3A", "2F",   "1D",   "1A",   "2A",   "3C",   "1B","2A",  "3A","1A","2A","3A"),
  Gender = c("M","M","M","F","F","F","M","M","M","M","M","M"),
  StartDate = c("43526","43102","42986","42140","43081","43451","43345","43346","43903","42768", "43257","43954"),
  EndDate = c("43644","43513","43101","43060","43434","43415","42986","43485","43988","43159","43810","2958101"))

Data$Date = as.numeric(Data$Date); Map$StartDate = as.numeric(Map$StartDate); Map$EndDate = as.numeric(Map$EndDate)
Data$Date = as.Date(Data$Date, origin = "1899-12-30");Map$StartDate = as.Date(Map$StartDate, origin = "1899-12-30");Map$EndDate = as.Date(Map$EndDate, origin = "1899-12-30")
Data$Date = format(Data$Date, "%Y/%m/%d");Map$StartDate = format(Map$StartDate, "%Y/%m/%d");Map$EndDate = format(Map$EndDate, "%Y/%m/%d");

#Do a full left join and subset when Date is in between StartDate and EndDate
s <- subset(merge(Data, Map, all.x = TRUE), Date >= StartDate & Date <= EndDate)
Joined = merge(Data, s, all.x = TRUE)  # add unmatched rows
#Output
> Joined
  Register  Name       Date Class Gender  StartDate    EndDate
1      141 Steve 2018/01/03    2F      M 2018/01/02 2019/02/17
2      501 Cloud 2020/06/06  <NA>   <NA>       <NA>       <NA>
3      565  Rbin 2019/09/01  <NA>   <NA>       <NA>       <NA>
4     1411  Hero 2020/04/27    3A      M 2020/03/13 2020/06/06

由于无法匹配注册表,因此我将使用名称。这是我的逻辑:如果通过 Register 匹配后 Class 为 NA,则 Map 使用 Name,否则,一切都应该保持不变。

这将是我想要的输出:

#Desired Output
  Register  Name       Date Class Gender  StartDate    EndDate
1      141 Steve 2018/01/03    2F          M 2018/01/02 2019/02/17
2      501 Cloud 2020/02/02    3A          M 2020/05/03 9999/01/01
3      565  Rbin 2019/09/01    <NA>       <NA>   <NA>       <NA>
4     1411  Hero 2020/04/27    3A          M 2020/03/13  2020/06/06

这段代码应该让您大致了解我想要做什么

Joined1 = Joined %>%
  filter(is.na(Joined$Class),) %>%
  left_join(Joined, Map, 
            by = c("Name" = "Name"))

标签: r

解决方案


你可以这样进行

library(fuzzyjoin)
library(tidyverse)

Map$StartDate <- as.Date(Map$StartDate, format = "%Y/%m/%d")
Map$EndDate <- as.Date(Map$EndDate, format = "%Y/%m/%d")
Data$Date <- as.Date(Data$Date, format = "%Y/%m/%d")

Merge1 <- Data %>% fuzzy_left_join(Map, by = c("Register" = "Register", "Date" = "StartDate", "Date" = "EndDate"), 
                         match_fun = c(`==`, `>=`, `<=`)) 

Merge1 %>% filter(is.na(Register.y)) %>% 
  select(Register = Register.x, Name = Name.x, Date) %>% 
  fuzzy_left_join(Map, by = c("Name" = "Name", "Date" = "StartDate", "Date" = "EndDate"), 
                           match_fun = c(`==`, `>=`, `<=`)) %>% rbind(Merge1 %>% filter(!is.na(Register.y))) %>%
  select(Register = Register.x, Name = Name.x, Class, Gender, Date)

  Register  Name Class Gender       Date
1      565  Rbin  <NA>   <NA> 2019-09-01
2      501 Cloud    3A      M 2020-06-06
3      141 Steve    2F      M 2018-01-03
4     1411  Hero    3A      M 2020-04-27

推荐阅读