首页 > 解决方案 > Filter columns based on the list of columns and list of values

问题描述

Is there a way how to filter in R (using tidyverse or data.table) when having lists of columns and equal values?

One list contains columns to be filtered, the second list contains which values of columns on the first list shall be selected respectively.


library(tidyverse)

df <- tibble(mtcars)

col1 <- c('mpg', 'am')
val1 <- c(21, 1)

# Pseudo idea of what I need
df %>%
  filter(col1 == val1)

# This is the results that shall be obtained by using col1 and val1

df %>% 
  filter(mpg == 21,
         am == 1)


col2 <- c('mpg', 'am', 'carb', 'vs')
val2 <- c(21, 1, 4, 0)

df %>%
  filter(col2 == val2)

However, it is very important that this should be generalizable and only filter condition will always be ==.

ALSO, it needs to be automated so it work for lists with only one element e.g.:

col4 <- c('vs')
val4 <- c(0)

标签: rdplyrdatatabletidyverse

解决方案


在较新的版本中dplyr,我们也可以使用if_all(相当于all它会返回所有复合逻辑表达式被&逻辑运算符折叠的行)。一个|if_any)。在这种情况下,我们可以有一个named'val's的向量

library(dplyr)
names(val1) <- col1
df %>%
    filter(if_all(all_of(col1), ~ . == val1[cur_column()]))
# A tibble: 2 x 11
#   mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1    21     6   160   110   3.9  2.62  16.5     0     1     4     4
#2    21     6   160   110   3.9  2.88  17.0     0     1     4     4

对于第二种情况

names(val2) <- col2
df %>% 
    filter(if_all(all_of(col2), ~ . == val2[cur_column()]))
# A tibble: 2 x 11
#    mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1    21     6   160   110   3.9  2.62  16.5     0     1     4     4
#2    21     6   160   110   3.9  2.88  17.0     0     1     4     4 

或者对于一个元素向量

names(val4) <- col4
df %>% 
   filter(if_all(all_of(col4), ~ . == val4[cur_column()]))
# A tibble: 18 x 11
#     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
# 2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
# 3  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
# 4  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
# 5  16.4     8  276.   180  3.07  4.07  17.4     0     0     3     3
# 6  17.3     8  276.   180  3.07  3.73  17.6     0     0     3     3
# 7  15.2     8  276.   180  3.07  3.78  18       0     0     3     3
# 8  10.4     8  472    205  2.93  5.25  18.0     0     0     3     4
# 9  10.4     8  460    215  3     5.42  17.8     0     0     3     4
#10  14.7     8  440    230  3.23  5.34  17.4     0     0     3     4
#11  15.5     8  318    150  2.76  3.52  16.9     0     0     3     2
#12  15.2     8  304    150  3.15  3.44  17.3     0     0     3     2
#13  13.3     8  350    245  3.73  3.84  15.4     0     0     3     4
#14  19.2     8  400    175  3.08  3.84  17.0     0     0     3     2
#15  26       4  120.    91  4.43  2.14  16.7     0     1     5     2
#16  15.8     8  351    264  4.22  3.17  14.5     0     1     5     4
#17  19.7     6  145    175  3.62  2.77  15.5     0     1     5     6
#18  15       8  301    335  3.54  3.57  14.6     0     1     5     8

或者另一种选择是创建一个表达式并用parse_expr

library(stringr)
df %>%
   filter(!! rlang::parse_expr(str_c(col1, val1, sep = '==', collapse="&")))

第二种情况也采用相同的方法

df %>%
   filter(!! rlang::parse_expr(str_c(col2, val2, sep = '==', collapse="&")))

# A tibble: 2 x 11
#    mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1    21     6   160   110   3.9  2.62  16.5     0     1     4     4
#2    21     6   160   110   3.9  2.88  17.0     0     1     4     4

或使用更新的案例

df %>%
  filter(!! rlang::parse_expr(str_c(col4, val4, sep = '==', collapse="&")))

# A tibble: 18 x 11
#     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
# 2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
# 3  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
# 4  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
# 5  16.4     8  276.   180  3.07  4.07  17.4     0     0     3     3
# 6  17.3     8  276.   180  3.07  3.73  17.6     0     0     3     3
# 7  15.2     8  276.   180  3.07  3.78  18       0     0     3     3
# 8  10.4     8  472    205  2.93  5.25  18.0     0     0     3     4
# 9  10.4     8  460    215  3     5.42  17.8     0     0     3     4
#10  14.7     8  440    230  3.23  5.34  17.4     0     0     3     4
#11  15.5     8  318    150  2.76  3.52  16.9     0     0     3     2
#12  15.2     8  304    150  3.15  3.44  17.3     0     0     3     2
#13  13.3     8  350    245  3.73  3.84  15.4     0     0     3     4
#14  19.2     8  400    175  3.08  3.84  17.0     0     0     3     2
#15  26       4  120.    91  4.43  2.14  16.7     0     1     5     2
#16  15.8     8  351    264  4.22  3.17  14.5     0     1     5     4
#17  19.7     6  145    175  3.62  2.77  15.5     0     1     5     6
#18  15       8  301    335  3.54  3.57  14.6     0     1     5     8

或者可以使用map2/reduce. 在filter'col2'、'val2' 值上循环,根据 'col2' 检查数据是否等于相应的 'val2' 值,reducelist逻辑s 转换为vector单个逻辑vector&filter

library(purrr)
df %>%
  filter(map2(col2, val2, ~  cur_data()[[.x]] == .y) %>% reduce(`&`))

rowSums或者另一种选择是使用afterselect从数据集中感兴趣的列创建逻辑表达式

df %>%
   filter(!rowSums(select(., col2) != val2[col(select(., col2))]))

或使用base R

subset(df, Reduce(`&`, Map(`==`, df[col2], val2)))

可以创建一个函数来做到这一点

f1 <- function(dat, colnms, vals) {

  dat %>%
     filter(map2(colnms, vals, ~  cur_data()[[.x]] == .y) %>%
      reduce(`&`))


 }

-测试

f1(df, col1, val1)
# A tibble: 2 x 11
    mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1    21     6   160   110   3.9  2.62  16.5     0     1     4     4
2    21     6   160   110   3.9  2.88  17.0     0     1     4     4
f1(df, col2, val2)
# A tibble: 2 x 11
    mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1    21     6   160   110   3.9  2.62  16.5     0     1     4     4
2    21     6   160   110   3.9  2.88  17.0     0     1     4     4
f1(df, col4, val4)
# A tibble: 18 x 11
     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
 2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
 3  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
 4  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
 5  16.4     8  276.   180  3.07  4.07  17.4     0     0     3     3
 6  17.3     8  276.   180  3.07  3.73  17.6     0     0     3     3
 7  15.2     8  276.   180  3.07  3.78  18       0     0     3     3
 8  10.4     8  472    205  2.93  5.25  18.0     0     0     3     4
 9  10.4     8  460    215  3     5.42  17.8     0     0     3     4
10  14.7     8  440    230  3.23  5.34  17.4     0     0     3     4
11  15.5     8  318    150  2.76  3.52  16.9     0     0     3     2
12  15.2     8  304    150  3.15  3.44  17.3     0     0     3     2
13  13.3     8  350    245  3.73  3.84  15.4     0     0     3     4
14  19.2     8  400    175  3.08  3.84  17.0     0     0     3     2
15  26       4  120.    91  4.43  2.14  16.7     0     1     5     2
16  15.8     8  351    264  4.22  3.17  14.5     0     1     5     4
17  19.7     6  145    175  3.62  2.77  15.5     0     1     5     6
18  15       8  301    335  3.54  3.57  14.6     0     1     5     8

推荐阅读