首页 > 解决方案 > 在 R 中根据特定列的值用虚拟数据填充数据框

问题描述

我有一个这样的数据框:

# Example input: ten households. `member` is the household size; the columns
# q_c<question>_<slot> hold the answers for household member number <slot>.
# Only slot 1 is populated; slots 2 to 5 are entirely missing (logical NA).
df <- data.frame(
  member = c(1L, 1L, 2L, 1L, 1L, 1L, 1L, 4L, 3L, 5L),
  # Slot 1: real answers
  q_c3_1 = rep_len(c("A", "B", "C"), 10L),
  q_c4_1 = rep(1L, 10L),
  q_c5_1 = rep(1900L, 10L),
  q_c6_1 = rep(0L, 10L),
  q_c7_1 = rep(0L, 10L),
  # Slot 2: empty
  q_c3_2 = rep(NA, 10L),
  q_c4_2 = rep(NA, 10L),
  q_c5_2 = rep(NA, 10L),
  q_c6_2 = rep(NA, 10L),
  q_c7_2 = rep(NA, 10L),
  # Slot 3: empty
  q_c3_3 = rep(NA, 10L),
  q_c4_3 = rep(NA, 10L),
  q_c5_3 = rep(NA, 10L),
  q_c6_3 = rep(NA, 10L),
  q_c7_3 = rep(NA, 10L),
  # Slot 4: empty
  q_c3_4 = rep(NA, 10L),
  q_c4_4 = rep(NA, 10L),
  q_c5_4 = rep(NA, 10L),
  q_c6_4 = rep(NA, 10L),
  q_c7_4 = rep(NA, 10L),
  # Slot 5: empty
  q_c3_5 = rep(NA, 10L),
  q_c4_5 = rep(NA, 10L),
  q_c5_5 = rep(NA, 10L),
  q_c6_5 = rep(NA, 10L),
  q_c7_5 = rep(NA, 10L),
  stringsAsFactors = FALSE
)

我需要根据 member 变量,用虚拟数据填充相应的变量。例如,如果 member == 2,那么 q_c3_2:q_c7_2 应该填入虚拟值:q_c3 为某个字符串(如 "Arne"),q_c4 为 1,q_c5 为 1900,q_c6 和 q_c7 为 0;如果 member == 3,那么 q_c3_2:q_c7_2 以及 q_c3_3:q_c7_3 都应该填入虚拟值(与上面的虚拟值相同),依此类推。我如何使用 tidyverse 高效地做到这一点?谢谢

我想要的输出应该是这个数据框

# Expected result: slot 1 keeps its real answers; slot k (k >= 2) carries the
# dummy answers ("Arne", 1, 1900, 0, 0) on every row whose `member` is at
# least k, and NA elsewhere.
df2 <- data.frame(
  member = c(1L, 1L, 2L, 1L, 1L, 1L, 1L, 4L, 3L, 5L),
  # Slot 1: unchanged real answers
  q_c3_1 = rep_len(c("A", "B", "C"), 10L),
  q_c4_1 = rep(1L, 10L),
  q_c5_1 = rep(1900L, 10L),
  q_c6_1 = rep(0L, 10L),
  q_c7_1 = rep(0L, 10L),
  # Slot 2: rows 3, 8, 9, 10 have member >= 2
  q_c3_2 = replace(rep(NA_character_, 10L), c(3L, 8L, 9L, 10L), "Arne"),
  q_c4_2 = replace(rep(NA_integer_, 10L), c(3L, 8L, 9L, 10L), 1L),
  q_c5_2 = replace(rep(NA_integer_, 10L), c(3L, 8L, 9L, 10L), 1900L),
  q_c6_2 = replace(rep(NA_integer_, 10L), c(3L, 8L, 9L, 10L), 0L),
  q_c7_2 = replace(rep(NA_integer_, 10L), c(3L, 8L, 9L, 10L), 0L),
  # Slot 3: rows 8, 9, 10 have member >= 3
  q_c3_3 = replace(rep(NA_character_, 10L), c(8L, 9L, 10L), "Arne"),
  q_c4_3 = replace(rep(NA_integer_, 10L), c(8L, 9L, 10L), 1L),
  q_c5_3 = replace(rep(NA_integer_, 10L), c(8L, 9L, 10L), 1900L),
  q_c6_3 = replace(rep(NA_integer_, 10L), c(8L, 9L, 10L), 0L),
  q_c7_3 = replace(rep(NA_integer_, 10L), c(8L, 9L, 10L), 0L),
  # Slot 4: rows 8 and 10 have member >= 4
  q_c3_4 = replace(rep(NA_character_, 10L), c(8L, 10L), "Arne"),
  q_c4_4 = replace(rep(NA_integer_, 10L), c(8L, 10L), 1L),
  q_c5_4 = replace(rep(NA_integer_, 10L), c(8L, 10L), 1900L),
  q_c6_4 = replace(rep(NA_integer_, 10L), c(8L, 10L), 0L),
  q_c7_4 = replace(rep(NA_integer_, 10L), c(8L, 10L), 0L),
  # Slot 5: only row 10 has member >= 5
  q_c3_5 = replace(rep(NA_character_, 10L), 10L, "Arne"),
  q_c4_5 = replace(rep(NA_integer_, 10L), 10L, 1L),
  q_c5_5 = replace(rep(NA_integer_, 10L), 10L, 1900L),
  q_c6_5 = replace(rep(NA_integer_, 10L), 10L, 0L),
  q_c7_5 = replace(rep(NA_integer_, 10L), 10L, 0L),
  stringsAsFactors = FALSE
)

标签: r dplyr tidyr

解决方案


假设虚拟值的具体内容无关紧要,并且使用 dplyr:

library(dplyr)

# Mark every answer cell that should hold data with the placeholder "dummy".
#
# Expects `df` to have an integer `member` column plus answer columns named
# q_c<question>_<slot>. A cell in slot k is set to "dummy" when the row's
# `member` is at least k, and to NA otherwise (also when `member` is NA).
# Existing values are replaced too, so every answer column ends up character.
#
# The earlier melt()/spread() round trip needed reshape2 and tidyr, neither of
# which is loaded here; a single across() pass gives the same result with
# dplyr alone.
df <- df %>%
  mutate(across(
    matches("^q_c\\d+_\\d+$"),
    ~ if_else(
      # The slot number is the trailing integer of the current column name.
      member >= as.integer(sub("^q_c\\d+_(\\d+)$", "\\1", cur_column())),
      "dummy",
      NA_character_
    )
  ))

编辑:根据要求使用虚拟变量。

library(dplyr)

# Dummy answer for a question number: q_c4 -> 1, q_c5 -> 1900, q_c6 and above
# -> 0, anything else (q_c3) -> "Arne". Integer literals keep the filled
# columns integer, matching the type of the real slot-1 answers.
dummy_for_question <- function(question) {
  if (question == 4L) {
    1L
  } else if (question == 5L) {
    1900L
  } else if (question >= 6L) {
    0L
  } else {
    "Arne"
  }
}

# Fill a single q_c<question>_<slot> column.
#
# values:      current contents of the column.
# column_name: its name, used to derive the question and slot numbers.
# member:      household size for each row.
#
# Returns a vector in which rows with member >= slot hold the dummy answer,
# all other rows are NA, and any value already present in `values` is kept.
fill_slot_column <- function(values, column_name, member) {
  question <- as.integer(sub("^q_c(\\d+)_\\d+$", "\\1", column_name))
  slot <- as.integer(sub("^q_c\\d+_(\\d+)$", "\\1", column_name))

  filled <- rep(dummy_for_question(question), length(member))
  filled[is.na(member) | member < slot] <- NA

  # Real answers win over dummies, so slot 1 ("A", "B", "C", ...) survives.
  has_value <- !is.na(values)
  filled[has_value] <- values[has_value]
  filled
}

# Expects `df` as originally defined (integer `member` plus answer columns).
# The earlier version relied on melt()/spread() without loading reshape2 or
# tidyr, and used a 9999 sentinel that was later swapped for "Arne"; that
# overwrote the real slot-1 answers and turned integer columns into doubles.
df <- df %>%
  mutate(across(
    matches("^q_c\\d+_\\d+$"),
    ~ fill_slot_column(.x, cur_column(), member)
  ))

推荐阅读