首页 > 解决方案 > 使用 dplyr 进行病例对照匹配

问题描述

我熟悉其他 R 包,例如 MatchIt,但我主要想找一个基于 dplyr 的解决方案。我还没有尝试过任何方案,因为这超出了我目前的技能水平。

假设我有两组患有相同疾病的患者,每组接受两种治疗方案中的一种。p$group=="A" 组同时接受了 p$total.ssa 和 p$total.ev,而 p$group=="B" 组只接受了 p$total.ssa。

> head(p)
  group total.ssa total.ev age WHO
1     A       347     3240  49   3
2     A       347     3240  54   3
3     A       456     4260  62   3
4     A       456     4260  55   3
5     A       163     1520  61   3
6     A       258     2410  55   3

假设我想为 p$group=="A" 和 p$group=="B" 创建 1 : 1 匹配。p$group=="A" 是需要与 p$group=="B" 进行匹配的组,匹配条件为:WHO 相同、age 相差不超过 3、total.ssa 相差不超过 200(见下方的预期输出)。

预期输出是一个新的 data frame,其中包含 p$group=="A" 与 p$group=="B" 两组之间匹配成功的患者子集。

预期产出

  group total.ssa       total.ev     age       WHO
1     A       347           3240     49        3
2     B       347+/-200     0        49+/-3    3
3     A       456           4260     62        3
4     B       456+/-200     0        62+/-3    3
5     A       163           1520     61        2
6     B       163+/-200     0        61+/-3    2

我的数据是

# Example patient data (dput() output): a data.frame with 75 rows and columns
#   group     - treatment group, "A" or "B"
#   total.ssa - numeric; contains NA for some group "B" rows
#   total.ev  - integer; 0 for every group "B" row
#   age       - numeric
#   WHO       - factor with levels "1", "2", "3", "4"
p <- structure(list(group = c("A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B", "B", 
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", 
"B", "B", "B", "B", "B", "A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "B", 
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", 
"B", "B"), total.ssa = c(347, 347, 456, 456, 163, 258, 1000, 
1000, 433, 433, 433, 433, 115, 115, 420, 580, 300, 300, 60, 220, 
60, 180, 220, 100, 100, 100, 220, 180, 180, 220, 60, NA, 60, 
60, 30, 120, 90, 360, 300, 60, 180, 90, 60, 330, 210, 180, 90, 
480, 60, 870, 240, 360, 210, 360, 120, 240, 1080, 270, 690, 870, 
270, 960, 110, 750, 150, 990, 30, 70, NA, 110, 710, 70, 870, 
510, 390), total.ev = c(3240L, 3240L, 4260L, 4260L, 1520L, 2410L, 
9330L, 9330L, 4040L, 4040L, 4040L, 4040L, 1070L, 1070L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 3230L, 205L, 840L, 160L, 3060L, 1920L, 
1470L, 700L, 2140L, 700L, 8160L, 1980L, 3300L, 1680L, 3360L, 
1040L, 2130L, 4950L, 2130L, 6310L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), age = c(49, 54, 62, 55, 
61, 55, 52, 55, 48, 36, 44, 56, 72, 80, 62, 83, 41, 67, 55, 68, 
26, 45, 52, 87, 51, 61, 51, 84, 74, 72, 63, 48, 54, 34, 28, 55, 
31, 47, 56, 65, 75, 55, 70, 48, 40, 60, 36, 30, 43, 54, 43, 70, 
58, 73, 43, 60, 44, 70, 53, 84, 56, 46, 71, 68, 65, 84, 68, 59, 
86, 64, 51, 63, 56, 43, 52), WHO = structure(c(3L, 3L, 3L, 3L, 
3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 1L, 3L, 1L, 3L, 3L, 2L, 
3L, 1L, 1L, 2L, 1L, 1L, 3L, 2L, 1L, 1L, 3L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 1L, 2L, 3L, 2L, 2L, 2L, 1L, 
3L, 3L, 3L, 2L, 3L, 2L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 2L, 
1L, 1L, 1L, 3L, 2L, 3L, 1L), .Label = c("1", "2", "3", "4"), class = "factor")), row.names = c(NA, 
75L), class = "data.frame")

标签: r, dataframe, dplyr, match

解决方案


您可以尝试以下方法:

library(dplyr)

# Split the patients by treatment group. `row` stores each patient's position
# inside its own group so that matched rows can be looked up again later.
A_data <- mutate(filter(p, group == 'A'), row = row_number())
B_data <- mutate(filter(p, group == 'B'), row = row_number())

# Pair every A patient with every B patient that has the same WHO value, then
# keep only the pairs whose ages differ by at most 3 and whose total.ssa
# values differ by at most 200. filter() also discards pairs in which either
# comparison is NA.
combine_data <- filter(
  full_join(A_data, B_data, by = 'WHO'),
  abs(age.x - age.y) <= 3,
  abs(total.ssa.x - total.ssa.y) <= 200
)

# Stack the matched A rows on top of the matched B rows, then interleave the
# two halves so that each A row is immediately followed by its B partner.
# Note: this yields every qualifying pair, so the same patient can show up in
# more than one pair.
bind_rows(
  slice(A_data, combine_data$row.x),
  slice(B_data, combine_data$row.y)
) %>%
  # order() keeps ties in their original order, giving 1, k + 1, 2, k + 2, ...
  slice(order(rep(seq_len(n() / 2), times = 2)))

#   group total.ssa total.ev age WHO row
#1      A       347     3240  49   3   1
#2      B       220        0  51   3  13
#3      A       347     3240  54   3   2
#4      B       220        0  51   3  13
#5      A       163     1520  61   3   5
#6      B        60        0  63   3  17
#7      A       163     1520  61   3   5
#8      B        70        0  63   3  38
#9      A       258     2410  55   3   6
#10     B        60        0  55   3   5
#...
#...

推荐阅读