r - 合并含有缺失值的数据框
问题描述
我有几个数据框,其中包含来自同一项调查的数据。我想将它们合并起来进行分析。每个数据框都包含各自独有的变量,以及所有数据框共享的两个变量(ID 和 Contest_no);这两个共享变量分别记录受访者和比赛编号(1、2、3,因为每位受访者被问了三次相同的问题)。
困难在于数据框有缺失值:
# Toy survey data: three partial tables that share the key columns ID and
# Contest_no. Every column is stored as a factor (stringsAsFactors = TRUE).
DF1 <- data.frame(
  V1 = c("A", "B", "C", "D"),
  V2 = c("A", "B", "C", "D"),
  ID = c("x1", "x1", "y2", "y2"),
  Contest_no = c("1", "2", "1", "2"),
  stringsAsFactors = TRUE
)

# DF2 has no row for respondent x1 in contest 2.
DF2 <- data.frame(
  V3 = c("A", "C", "D"),
  V4 = c("A", "C", "D"),
  ID = c("x1", "y2", "y2"),
  Contest_no = c("1", "1", "2"),
  stringsAsFactors = TRUE
)

# DF3 has no row for respondent y2 in contest 2.
DF3 <- data.frame(
  V5 = c("A", "B", "C"),
  V6 = c("A", "B", "C"),
  ID = c("x1", "x1", "y2"),
  Contest_no = c("1", "2", "1"),
  stringsAsFactors = TRUE
)
因此,各数据框中的受访者 ID 和比赛编号并不完全一致。我想按受访者 ID 和比赛编号匹配数据,使合并后的数据框如下所示:
# Desired result: one row per (ID, Contest_no) pair, with NA wherever a source
# table has no row for that pair. Every column is stored as a factor.
DF_merged <- data.frame(
  V1 = c("A", "B", "C", "D"),
  V2 = c("A", "B", "C", "D"),
  V3 = c("A", NA, "C", "D"),
  V4 = c("A", NA, "C", "D"),
  V5 = c("A", "B", "C", NA),
  V6 = c("A", "B", "C", NA),
  ID = c("x1", "x1", "y2", "y2"),
  Contest_no = c("1", "2", "1", "2"),
  stringsAsFactors = TRUE
)
我以为 full_join 可以解决问题,但 DF_merged <- full_join(DF1, DF2, DF3, by="ID") 给了我荒谬的结果。
如何组合这样的不同数据?
更新后的新示例(针对行数成倍增加的问题)。在此示例中,完全没有缺失值,并且两个数据框的行数相同,但代码却导致行数成倍增加。首先是要合并的两个数据框:
df1:
structure(list(ID = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L), .Label = c("EE1", "EE101", "EE102"), class = "factor"),
Contest_no = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 2L, 2L, 3L,
3L), Option = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"),
Chosen_option = c(0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L,
0L, 1L), Combination = structure(c(5L, 5L, 6L, 6L, 4L, 4L,
2L, 2L, 1L, 1L, 3L, 3L), .Label = c("V133", "V181", "V234",
"V252", "V32", "V67"), class = "factor"), Attribute1 = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L), .Label = c("has strong ties to the government",
"has weak ties to the government"), class = "factor"), Attribute2 = structure(c(1L,
2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community",
"has weak ties to the local pastoralist community"), class = "factor"),
Attribute3 = structure(c(2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L,
2L, 1L, 1L, 2L), .Label = c("is poor", "is wealthy"), class = "factor"),
Attribute4 = structure(c(2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L,
1L, 2L, 2L, 2L), .Label = c("has attained a high level of formal education (for example university degree)",
"has not attained a high level of formal education (for example never went to school or only attended primary school)"
), class = "factor")), .Names = c("ID", "Contest_no", "Option",
"Chosen_option", "Combination", "Attribute1", "Attribute2", "Attribute3",
"Attribute4"), class = "data.frame", row.names = c(NA, -12L))
df2:
structure(list(ID = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L), .Label = c("EE1", "EE101", "EE102"), class = "factor"),
Contest_no = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 2L, 2L, 3L,
3L), Option = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"),
Chosen_option = c(1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L,
0L, 1L), Combination = structure(c(6L, 6L, 4L, 4L, 1L, 1L,
3L, 3L, 5L, 5L, 2L, 2L), .Label = c("V150", "V249", "V252",
"V29", "V56", "V77"), class = "factor"), Attribute1 = structure(c(2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("has strong ties to the government",
"has weak ties to the government"), class = "factor"), Attribute2 = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community",
"has weak ties to the local pastoralist community"), class = "factor"),
Attribute3 = structure(c(2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L,
2L, 1L, 1L, 2L), .Label = c("is poor", "is wealthy"), class = "factor"),
Attribute4 = structure(c(2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L,
1L, 1L, 2L, 2L), .Label = c("has attained a high level of formal education (for example university degree)",
"has not attained a high level of formal education (for example never went to school or only attended primary school)"
), class = "factor")), .Names = c("ID", "Contest_no", "Option",
"Chosen_option", "Combination", "Attribute1", "Attribute2", "Attribute3",
"Attribute4"), class = "data.frame", row.names = c(NA, -12L))
下面是合并这两个数据框的失败尝试:
# Failed attempt: full join of the two updated frames on respondent and
# contest number only.
df_merge_attempt <- dplyr::full_join(
  x = df1,
  y = df2,
  by = c("ID", "Contest_no")
)
结果是:
structure(list(ID = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L
), .Label = c("EE1", "EE101", "EE102"), class = "factor"), Contest_no = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L), Option.x = structure(c(1L, 1L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
2L, 1L, 1L, 2L, 2L), .Label = c("Option1", "Option2"), class = "factor"),
Chosen_option.x = c(0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L,
1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L),
Combination.x = structure(c(5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L,
4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 3L, 3L,
3L), .Label = c("V133", "V181", "V234", "V252", "V32", "V67"
), class = "factor"), Attribute1.x = structure(c(1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("has strong ties to the government",
"has weak ties to the government"), class = "factor"), Attribute2.x = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L), .Label = c("has strong ties to the local pastoralist community",
"has weak ties to the local pastoralist community"), class = "factor"),
Attribute3.x = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L,
2L), .Label = c("is poor", "is wealthy"), class = "factor"),
Attribute4.x = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("has attained a high level of formal education (for example university degree)",
"has not attained a high level of formal education (for example never went to school or only attended primary school)"
), class = "factor"), Option.y = structure(c(1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"),
Chosen_option.y = c(1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L,
1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L),
Combination.y = structure(c(6L, 6L, 6L, 6L, 4L, 4L, 4L, 4L,
1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 2L, 2L, 2L,
2L), .Label = c("V150", "V249", "V252", "V29", "V56", "V77"
), class = "factor"), Attribute1.y = structure(c(2L, 2L,
2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 2L, 1L), .Label = c("has strong ties to the government",
"has weak ties to the government"), class = "factor"), Attribute2.y = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community",
"has weak ties to the local pastoralist community"), class = "factor"),
Attribute3.y = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
2L), .Label = c("is poor", "is wealthy"), class = "factor"),
Attribute4.y = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("has attained a high level of formal education (for example university degree)",
"has not attained a high level of formal education (for example never went to school or only attended primary school)"
), class = "factor")), class = "data.frame", row.names = c(NA,
-24L), .Names = c("ID", "Contest_no", "Option.x", "Chosen_option.x",
"Combination.x", "Attribute1.x", "Attribute2.x", "Attribute3.x",
"Attribute4.x", "Option.y", "Chosen_option.y", "Combination.y",
"Attribute1.y", "Attribute2.y", "Attribute3.y", "Attribute4.y"
))
解决方案
您可以尝试使用 dplyr::full_join,并指定参数 by=c("ID","Contest_no"),如下所示:
library(dplyr)

# Chain two full joins on both shared keys. full_join() keeps every
# (ID, Contest_no) pair found in any of the tables and fills the columns of a
# table that lacks that pair with NA.
df1 <- DF1 %>%
  full_join(DF2, by = c("ID", "Contest_no")) %>%
  full_join(DF3, by = c("ID", "Contest_no"))
df1
# The key columns keep the position they have in the left-hand table (DF1);
# the non-key columns of DF2 and DF3 are appended after them.
#   V1 V2 ID Contest_no   V3   V4   V5   V6
# 1  A  A x1          1    A    A    A    A
# 2  B  B x1          2 <NA> <NA>    B    B
# 3  C  C y2          1    C    C    C    C
# 4  D  D y2          2    D    D <NA> <NA>
更新:答案已修改,将另一列 Option 也加入 full_join 的 by 参数:
# In the updated example every (ID, Contest_no) pair occurs twice, once per
# Option, so Option has to be part of the join key; joining on ID and
# Contest_no alone matches 2 x 2 rows per pair (24 rows instead of 12).
# The inputs are the updated frames df1 and df2: DF1 and DF2 from the first
# example have no Option column, so they cannot be joined on it.
# The result gets its own name so that the input df1 is not overwritten.
# The remaining columns shared by both frames receive .x / .y suffixes.
df_merged <- full_join(df1, df2, by = c("ID", "Contest_no", "Option"))
注意:我必须按照 @Gregor 的建议调整我的 dplyr 代码,才能得到预期的结果。
推荐阅读
- vue.js - 是否可以从 vue-dropzone2 中删除 URL 选项?
- javascript - 在一个函数中反转字符串并删除重复元素
- powershell - 比较用户 AD 和 CSV 文件列 Powershell
- node.js - NodeJS Promise Map 没有等待 Sequelize Query
- api-platform.com - 禁用特定格式的分页
- javascript - 状态未在控制台中显示更新的值
- ios - 如何使用 Tuist iOS 启用功能
- javascript - 打开一个项目时如何关闭所有项目
- youtrack - YouTrack 如何在问题表中显示冲刺信息
- xsl-fo - 为什么 fo:footnote 丢失