首页 > 解决方案 > Rowwise,如何指定某个值来自哪一列?

问题描述

我有一个包含几列的数据框,我创建了一个新列,它从其他列中随机采样一个值。如何回溯以判断值来自哪一列?

我在这里看到过完全相同的问题及其解决方案,但那是用 Python 写的,我找不到 R 中的等价做法。

数据 1 :: 每行在列之间具有不同的值

# Example data 1: within each row, the non-missing values are all distinct.
df_uniques <- data.frame(
  col_a = c(2, 2, 5, 5, 3),
  col_b = c(NA_real_, 4, 2, 3, 1),
  col_c = c(4, 5, 3, 1, 2),
  col_d = c(1, NA_real_, 4, 2, 4),
  col_e = c(3, 3, 1, 4, 5)
)

> df_uniques

##   col_a col_b col_c col_d col_e
## 1     2    NA     4     1     3
## 2     2     4     5    NA     3
## 3     5     2     3     4     1
## 4     5     3     1     2     4
## 5     3     1     2     4     5

用 mutate 新建一列,从前面各列中随机抽取一个值

library(dplyr)

# Fix the RNG state so the sampled values are reproducible.
set.seed(2020)

# Treat every row as its own group and draw a single value from its five
# columns (under rowwise() each group holds exactly one row).
df_uniques %>%
  rowwise() %>%
  mutate(
    sampled = sample(c_across(col_a:col_e), size = 1L)
  )

## # A tibble: 5 x 6
## # Rowwise: 
##   col_a col_b col_c col_d col_e sampled
##   <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1     2    NA     4     1     3       1
## 2     2     4     5    NA     3      NA
## 3     5     2     3     4     1       5
## 4     5     3     1     2     4       5
## 5     3     1     2     4     5       4

数据 2 :: 每行在列之间具有重复值

# Example data 2: a row may hold the same value (or NA) in several columns.
df_duplicates <- data.frame(
  col_a = c(1, 4, 2, 5, 2),
  col_b = c(NA_real_, 4, NA_real_, 3, 1),
  col_c = c(4, NA_real_, 5, NA_real_, NA_real_),
  col_d = c(1, NA_real_, NA_real_, 2, NA_real_),
  col_e = c(2, 3, NA_real_, NA_real_, 5)
)

> df_duplicates
##   col_a col_b col_c col_d col_e
## 1     1    NA     4     1     2     
## 2     4     4    NA    NA     3
## 3     2    NA     5    NA    NA
## 4     5     3    NA     2    NA
## 5     2     1    NA    NA     5

用 mutate 新建一列,从前面各列中随机抽取一个值

# Fix the RNG state so the sampled values are reproducible.
set.seed(2020)

# Treat every row as its own group and draw a single value from its five
# columns (under rowwise() each group holds exactly one row).
df_duplicates %>%
  rowwise() %>%
  mutate(
    sampled = sample(c_across(col_a:col_e), size = 1L)
  )

## # A tibble: 5 x 6
## # Rowwise: 
##   col_a col_b col_c col_d col_e sampled
##   <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1     1    NA     4     1     2      NA
## 2     4     4    NA    NA     3       4
## 3     2    NA     5    NA    NA      NA
## 4     5     3    NA     2    NA       3
## 5     2     1    NA    NA     5       1

追溯:sampled 的值来自哪一列

期望的输出(数据 1 :: uniques)

# A tibble: 5 x 7
# Rowwise: 
  col_a col_b col_c col_d col_e sampled origin_col
  <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>     
1     2    NA     4     1     3       1 col_d     
2     2     4     5    NA     3      NA col_d     
3     5     2     3     4     1       5 col_a     
4     5     3     1     2     4       5 col_a     
5     3     1     2     4     5       4 col_d 

期望的输出(数据 2 :: 重复)

# A tibble: 5 x 7
# Rowwise: 
  col_a col_b col_c col_d col_e sampled origin_col   
  <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>        
1     1    NA     4     1     2       1 col_a, col_d
2     4     4    NA    NA     3      NA col_c, col_d 
3     2    NA     5    NA    NA       2 col_a        
4     5     3    NA     2    NA       5 col_a        
5     2     1    NA    NA     5      NA col_c, col_d 

标签: r dataframe dplyr

解决方案


你在寻找这样的东西吗?

# Names of the columns that `sampled` is drawn from.
cols <- paste0("col_", letters[1:5])

# Functional sequence; call it as `workflow(df)`.
# For each row it draws one value from `cols` into `sampled`, then lists in
# `origin_col` (comma-separated) every column of that row holding that value.
workflow <- . %>%
  rowwise() %>%
  mutate(
    sampled = sample(c_across(all_of(cols)), size = 1L),
    # `%in%` matches NA against NA, so a sampled NA is traced back as well.
    origin_col = toString(cols[c_across(all_of(cols)) %in% sampled])
  )

输出

> set.seed(2020L); workflow(df_uniques)
# A tibble: 5 x 7
# Rowwise: 
  col_a col_b col_c col_d col_e sampled origin_col
  <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>       
1     2    NA     4     1     3       1 col_d       
2     2     4     5    NA     3      NA col_d       
3     5     2     3     4     1       5 col_a       
4     5     3     1     2     4       5 col_a       
5     3     1     2     4     5       4 col_d       

> set.seed(2020L); workflow(df_duplicates)
# A tibble: 5 x 7
# Rowwise: 
  col_a col_b col_c col_d col_e sampled origin_col
  <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>       
1     1    NA     4     1     2       1 col_a, col_d
2     4     4    NA    NA     3      NA col_c, col_d
3     2    NA     5    NA    NA       2 col_a       
4     5     3    NA     2    NA       5 col_a       
5     2     1    NA    NA     5      NA col_c, col_d

方法 1:为您选择的列创建一个临时变量

# Approach 1: stash the selected columns in a temporary column `d`, so the
# tidyselect helper is written only once. Call it as `workflow(df)`.
workflow <-
  . %>%
  rowwise() %>%
  mutate(
    # Temporary data-frame column holding the columns picked by starts_with().
    d = across(starts_with("col_")),
    sampled = sample(c_across(names(d)), 1L),
    # Named `origin_col` to match the requested output; `%in%` also matches a
    # sampled NA against NA cells.
    origin_col = toString(names(d)[which(c_across(names(d)) %in% sampled)]),
    # Drop the temporary column again.
    d = NULL
  )

方法2:将所有内容包装在一个函数中

#' Sample one value per row and report which column(s) it came from
#'
#' @param df A data frame.
#' @param prefix Prefix of the names of the columns to sample from.
#'   Defaults to `"col_"`.
#' @return `df` as a rowwise tibble with two extra columns: `sampled`, the
#'   value drawn from the selected columns of that row, and `origin_col`, a
#'   comma-separated string naming every selected column holding that value.
workflow <- function(df, prefix = "col_") {
  cols <- names(df)
  cols <- cols[starts_with(prefix, vars = cols)]
  # or cols <- cols[startsWith(cols, prefix)]
  # or cols <- cols[grepl("^col_", cols)]
  # ...

  # sample() cannot draw from an empty vector; fail early with a clear message.
  if (length(cols) == 0L) {
    stop("no column name starts with \"", prefix, "\"", call. = FALSE)
  }

  df %>%
    rowwise() %>%
    mutate(
      sampled = sample(c_across(all_of(cols)), 1L),
      # Named `origin_col` to match the requested output; `%in%` also matches
      # a sampled NA against NA cells.
      origin_col = toString(cols[which(c_across(all_of(cols)) %in% sampled)])
    )
}

我更喜欢第二种方法,因为它更灵活。


推荐阅读