r - 基于不同变量的交互式连接
问题描述
我有两个数据框如下:
df<-data.frame(
id=c("1-1","2-2","3-3","4-4","5-5","6-6"),
identifer=c(1,2,3,4,5,6),
key=c("A","B","C","D","E","F"),
product=c("productA","productB","productC","productD","productE","productF"),
ingredient=c("ingredientA","ingredientB","ingredientC","ingredientD","ingredientE","ingredientF"),
DF=c("Tablet","Powder","Suspension","System","Capsule","Capsule"))
df_2<-data.frame(
identifer=c(1,2,2,3,4,6),
key=c("A","B","B","C","D","F"),
product=c("productA","productB","productB","productCC","productDD","productFF"),
ingredient=c("ingredientA","ingredientBB","ingredientB","ingredientC","ingredientDD","ingredeintFF"),
DF=c("Tablet","Powder","Powder","Suspension","injection","tablet"),
Route=c("ORAL","INHALATION","INHALATION","topical","injecatable","oral")
)
我想首先在以下变量上加入这两个数据集+创建一个名为“match”的新列来描述连接:
1) identifier,key, product, ingredient,DF
match="identifier,key, product, ingredient,DF"
然后,我想加入这些变量的剩余行:
2)identifier, key, product, DF
match="identifier,key, product,DF"
然后是步骤 2 中关于这些变量的剩余行,依此类推。
3) identifier, key, Ingredient, DF
4) identifier, key, DF
5) identifer, key, product, ingredient
7) identifer, key, product
8) identifer, key, ingredient
9) identifier, key
我也想返回不匹配的行。我知道如何逐步做到这一点,但我想知道是否有更简单的方法来做到这一点?
这是预期的输出:
df_out<-data.frame(
identifer=c(1,2,3,4,5,6),
key=c("A","B","C","D","E","F"),
product_1=c("productA","productB","productC","productD","productE","productF"),
ingredient_1=c("ingredientA","ingredientB","ingredientC","ingredientD","ingredientE","ingredientF"),
DF_1=c("Tablet","Powder","Suspension","System","Capsule","Capsule"),
product_2=c("productA","productB","productCC","productDD",NA,"productFF"),
ingredient_2=c("ingredientA","ingredientB","ingredientC","ingredientDD",NA,"ingredeintFF"),
DF_2=c("Tablet","Powder","Suspension","injection",NA,"tablet"),
Route_2=c("ORAL","INHALATION",'topical',"injecatable",NA,"oral"),
Match=c("identifer+key+product+ingredient+DF","identifier+key+product+ingredient+DF","identifier+key+ingredient+DF","identifer+key","None","identifer+key+product+ingredient"))
解决方案
这是使用的选项data.table
:
library(data.table)
setDT(df)
setDT(df_2)
keyord <- list(
c("product", "ingredient", "DF"),
c("product", "DF"),
c("ingredient", "DF"),
"DF",
c("product", "ingredient"),
"product",
"ingredient",
c()
)
cols <- c("product", "ingredient", "DF", "Route")
df[, Match := NA_character_]
for (v in keyord) {
k <- c("identifier", "key", v)
df[df_2, on=k, c(paste0(cols, "_2"), "check") := c(mget(paste0("i.", cols)), .(TRUE))]
df[is.na(Match) & check, Match := toString(k)]
}
setnames(df, cols, paste0(cols, "_1"), skip_absent=TRUE)
输出:
id identifier key product_1 ingredient_1 DF_1 Match product_2 ingredient_2 DF_2 Route_2 check
1: 1-1 1 A productA ingredientA Tablet identifier, key, product, ingredient, DF productA ingredientA Tablet ORAL TRUE
2: 2-2 2 B productB ingredientB Powder identifier, key, product, ingredient, DF productB ingredientB Powder INHALATION TRUE
3: 3-3 3 C productC ingredientC Suspension identifier, key, ingredient, DF productCC ingredientC Suspension topical TRUE
4: 4-4 4 D productD ingredientD System identifier, key productDD ingredientDD injection injecatable TRUE
5: 5-5 5 E productE ingredientE Capsule <NA> <NA> <NA> <NA> <NA> NA
6: 6-6 6 F productF ingredientF Capsule identifier, key, product, ingredient productF ingredientF tablet oral TRUE
修复OP中的一些错别字后的数据:
df <- data.frame(
id=c("1-1","2-2","3-3","4-4","5-5","6-6"),
identifier=c(1,2,3,4,5,6),
key=c("A","B","C","D","E","F"),
product=c("productA","productB","productC","productD","productE","productF"),
ingredient=c("ingredientA","ingredientB","ingredientC","ingredientD","ingredientE","ingredientF"),
DF=c("Tablet","Powder","Suspension","System","Capsule","Capsule"))
df_2 <- data.frame(
identifier=c(1,2,2,3,4,6),
key=c("A","B","B","C","D","F"),
product=c("productA","productB","productB","productCC","productDD","productF"),
ingredient=c("ingredientA","ingredientBB","ingredientB","ingredientC","ingredientDD","ingredientF"),
DF=c("Tablet","Powder","Powder","Suspension","injection","tablet"),
Route=c("ORAL","INHALATION","INHALATION","topical","injecatable","oral")
)
编辑多个匹配项:
df_2 <- data.frame( identifier=c(1,2,2,3,4,4,6), key=c("A","B","B","C","D","D","F"), product=c("productA","productB","productB","productCC","productDD","productDd","productF"), ingredient=c("ingredientA","ingredientBB","ingredientB","ingredientC","ingredientDD",NA,"ingredientF"), DF=c("Tablet","Powder","Powder","Suspension","injection",NA,"tablet"), Route=c("ORAL","INHALATION","INHALATION","topical","injecatable",NA,"oral") )
setDT(df_2)
df[, c("Match", "check") := .(NA_character_, FALSE)]
ocols <- unique(unlist(keyord))
rbindlist(lapply(keyord, function(v) {
k <- c("identifier", "key", v)
a <- df_2[df[(!check)], on=k, nomatch=0L, c(.(id=id),
setNames(mget(paste0("i.", ocols)), paste0(ocols, "_1")),
setNames(mget(paste0("x.", c(ocols, "Route"))), paste0(c(ocols, "Route"), "_2")))
]
df[id %chin% a$id, check := TRUE]
a
}), use.names=TRUE)
输出:
id product_1 ingredient_1 DF_1 product_2 ingredient_2 DF_2 Route_2
1: 1-1 productA ingredientA Tablet productA ingredientA Tablet ORAL
2: 2-2 productB ingredientB Powder productB ingredientB Powder INHALATION
3: 3-3 productC ingredientC Suspension productCC ingredientC Suspension topical
4: 6-6 productF ingredientF Capsule productF ingredientF tablet oral
5: 4-4 productD ingredientD System productDD ingredientDD injection injecatable
6: 4-4 productD ingredientD System productDd <NA> <NA> <NA>
推荐阅读
- php - 如何使用 input name="value[]" 将不同的值插入到 db
- markdown - 这是什么降价?
- c# - 将上下文属性添加到 Serilog 异常记录
- latex - 如何更改默认自动引用类别以将自动引用与不受支持的语言一起使用?
- javascript - 函数中何时允许使用速记?
- xna - 在顶点着色器novadays中使用纹理的方法是什么?
- android - React Native app.json 重复资源
- android - Xamarin.Android、AndroidManifest.xml、AndroidManifestMerger、xmlns:tools 架构,始终手动替换
- c++ - [hh:mm:ss:msms] 格式的时间戳 C++
- embedded-linux - 关于 yocto /etc/os-release 版本控制