首页 > 解决方案 > 将数据框与缺失值相结合

问题描述

我有几个数据框,其中包含来自同一调查的数据。我想将它们结合起来进行分析。数据帧包含唯一变量和在所有数据帧之间共享的两个变量(ID 和 Contest_no);这两个共享变量包含有关受访者和比赛编号的信息(1,2,3,因为受访者被问了三次相同的问题)。

困难在于数据框有缺失值:

DF1 <- data.frame(V1 = factor(c("A", "B", "C", "D")),
             V2 = factor(c("A", "B", "C", "D")),
             ID = factor(c("x1", "x1", "y2", "y2")),
             Contest_no = factor(c("1", "2", "1", "2")))

DF2 <- data.frame(V3 = factor(c("A", "C", "D")),
                  V4 = factor(c("A", "C", "D")),
                  ID = factor(c("x1", "y2", "y2")),
                  Contest_no = factor(c("1", "1", "2")))

DF3 <- data.frame(V5 = factor(c("A", "B", "C")),
                  V6 = factor(c("A", "B", "C")),
                  ID = factor(c("x1", "x1", "y2")),
                  Contest_no = factor(c("1", "2", "1")))

因此,受访者 ID 和竞赛编号不一致。我想将数据与受访者 IDS 和比赛编号相匹配,以便合并的数据框如下所示:

DF_merged <- data.frame(V1 = factor(c("A", "B", "C", "D")),
                    V2 = factor(c("A", "B", "C", "D")),
                    V3 = factor(c("A", NA, "C", "D")),
                    V4 = factor(c("A", NA, "C", "D")),
                    V5 = factor(c("A", "B", "C", NA)),
                    V6 = factor(c("A", "B", "C", NA)),
                    ID = factor(c("x1", "x1", "y2", "y2")),
                    Contest_no = factor(c("1", "2", "1", "2")))  

我认为这full_join可以解决问题,但DF_merged <- full_join(DF1, DF2, DF3, by="ID")给了我荒谬的结果。

如何组合这样的不同数据?

新的、更新的示例(解决多行问题)。在此示例中,根本没有缺失值,并且两个数据帧具有相同的行数,但代码导致行数相乘。首先,要合并的两个数据框:

df1:

structure(list(ID = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L), .Label = c("EE1", "EE101", "EE102"), class = "factor"), 
    Contest_no = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 2L, 2L, 3L, 
    3L), Option = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"), 
    Chosen_option = c(0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 
    0L, 1L), Combination = structure(c(5L, 5L, 6L, 6L, 4L, 4L, 
    2L, 2L, 1L, 1L, 3L, 3L), .Label = c("V133", "V181", "V234", 
    "V252", "V32", "V67"), class = "factor"), Attribute1 = structure(c(1L, 
    1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L), .Label = c("has strong ties to the government", 
    "has weak ties to the government"), class = "factor"), Attribute2 = structure(c(1L, 
    2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community", 
    "has weak ties to the local pastoralist community"), class = "factor"), 
    Attribute3 = structure(c(2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 
    2L, 1L, 1L, 2L), .Label = c("is poor", "is wealthy"), class = "factor"), 
    Attribute4 = structure(c(2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 
    1L, 2L, 2L, 2L), .Label = c("has attained a high level of formal education (for example university degree)", 
    "has not attained a high level of formal education (for example never went to school or only attended primary school)"
    ), class = "factor")), .Names = c("ID", "Contest_no", "Option", 
"Chosen_option", "Combination", "Attribute1", "Attribute2", "Attribute3", 
"Attribute4"), class = "data.frame", row.names = c(NA, -12L))

df2:

structure(list(ID = structure(c(1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 
3L, 3L, 3L, 3L), .Label = c("EE1", "EE101", "EE102"), class = "factor"), 
    Contest_no = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 2L, 2L, 3L, 
    3L), Option = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"), 
    Chosen_option = c(1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 
    0L, 1L), Combination = structure(c(6L, 6L, 4L, 4L, 1L, 1L, 
    3L, 3L, 5L, 5L, 2L, 2L), .Label = c("V150", "V249", "V252", 
    "V29", "V56", "V77"), class = "factor"), Attribute1 = structure(c(2L, 
    2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("has strong ties to the government", 
    "has weak ties to the government"), class = "factor"), Attribute2 = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community", 
    "has weak ties to the local pastoralist community"), class = "factor"), 
    Attribute3 = structure(c(2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 
    2L, 1L, 1L, 2L), .Label = c("is poor", "is wealthy"), class = "factor"), 
    Attribute4 = structure(c(2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 
    1L, 1L, 2L, 2L), .Label = c("has attained a high level of formal education (for example university degree)", 
    "has not attained a high level of formal education (for example never went to school or only attended primary school)"
    ), class = "factor")), .Names = c("ID", "Contest_no", "Option", 
"Chosen_option", "Combination", "Attribute1", "Attribute2", "Attribute3", 
"Attribute4"), class = "data.frame", row.names = c(NA, -12L))

现在尝试组合两个数据框的失败:

df_merge_attempt <- dplyr::full_join(df1, df2, by=c("ID","Contest_no"))

结果是:

structure(list(ID = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L
), .Label = c("EE1", "EE101", "EE102"), class = "factor"), Contest_no = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 
2L, 2L, 2L, 3L, 3L, 3L, 3L), Option.x = structure(c(1L, 1L, 2L, 
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 
2L, 1L, 1L, 2L, 2L), .Label = c("Option1", "Option2"), class = "factor"), 
    Chosen_option.x = c(0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 
    1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L), 
    Combination.x = structure(c(5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 
    4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 
    3L), .Label = c("V133", "V181", "V234", "V252", "V32", "V67"
    ), class = "factor"), Attribute1.x = structure(c(1L, 1L, 
    1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 
    2L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("has strong ties to the government", 
    "has weak ties to the government"), class = "factor"), Attribute2.x = structure(c(1L, 
    1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 
    2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L), .Label = c("has strong ties to the local pastoralist community", 
    "has weak ties to the local pastoralist community"), class = "factor"), 
    Attribute3.x = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 
    1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 
    2L), .Label = c("is poor", "is wealthy"), class = "factor"), 
    Attribute4.x = structure(c(2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 
    2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 
    2L), .Label = c("has attained a high level of formal education (for example university degree)", 
    "has not attained a high level of formal education (for example never went to school or only attended primary school)"
    ), class = "factor"), Option.y = structure(c(1L, 2L, 1L, 
    2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L, 1L, 2L), .Label = c("Option1", "Option2"), class = "factor"), 
    Chosen_option.y = c(1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 
    1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L), 
    Combination.y = structure(c(6L, 6L, 6L, 6L, 4L, 4L, 4L, 4L, 
    1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 2L, 2L, 2L, 
    2L), .Label = c("V150", "V249", "V252", "V29", "V56", "V77"
    ), class = "factor"), Attribute1.y = structure(c(2L, 2L, 
    2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 2L, 1L, 2L, 1L), .Label = c("has strong ties to the government", 
    "has weak ties to the government"), class = "factor"), Attribute2.y = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L), .Label = c("has strong ties to the local pastoralist community", 
    "has weak ties to the local pastoralist community"), class = "factor"), 
    Attribute3.y = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 
    2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 
    2L), .Label = c("is poor", "is wealthy"), class = "factor"), 
    Attribute4.y = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
    1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 
    2L), .Label = c("has attained a high level of formal education (for example university degree)", 
    "has not attained a high level of formal education (for example never went to school or only attended primary school)"
    ), class = "factor")), class = "data.frame", row.names = c(NA, 
-24L), .Names = c("ID", "Contest_no", "Option.x", "Chosen_option.x", 
"Combination.x", "Attribute1.x", "Attribute2.x", "Attribute3.x", 
"Attribute4.x", "Option.y", "Chosen_option.y", "Combination.y", 
"Attribute1.y", "Attribute2.y", "Attribute3.y", "Attribute4.y"
))

标签: rdataframedplyr

解决方案


您可以尝试dplyr::full_join使用by=c("ID","Contest_no")以下参数:

library(dplyr)
df1 <- full_join(DF1, DF2, by=c("ID","Contest_no")) %>%
       full_join(DF3, by=c("ID","Contest_no"))
df1
#  V1 V2   V3   V4   V5   V6 ID Contest_no
#1  A  A    A    A    A    A x1          1
#2  B  B <NA> <NA>    B    B x1          2
#3  C  C    C    C    C    C y2          1
#4  D  D    D    D <NA> <NA> y2          2

更新:答案已修改为将另一列Option视为full_join

df1 <- full_join(DF1, DF2, by=c("ID","Contest_no", "Option")) 

注意:我必须调整我dplyr以匹配@Gregor 的建议以获得预期的结果。


推荐阅读