首页 > 解决方案 > 尝试使用 dplyr 和 purrr 从另一个数据帧中的值生成数据帧

问题描述

我正在尝试创建一个基于另一个数据框的内容生成的数据框。在下面的示例中,我使用 tibble 的 n_seqs 列来指定 rnorm 函数的平均值,然后生成 my_tibble。my_tibble 的第一列应该包含来自 group 列的值,随后的列应该包含运行 rnorm 得到的 10 个随机值。正如下面可重现的示例所示,我能够通过一种相当取巧的方法来解决这个问题。

我不明白...

library(tidyverse)
# Build one element per group: 10 normal draws whose mean is that group's
# n_seqs value, labelled a..j. Despite its name, my_tibble ends up as a plain
# list of named numeric vectors (see the printed output), not a tibble.
my_tibble <- tibble(group = c("A", "B", "C"), n_seqs = c(5, 7, 10)) %>%
  pull(n_seqs) %>%
  map(~ setNames(rnorm(n = 10, mean = .x), letters[1:10]))
my_tibble
#> [[1]]
#>        a        b        c        d        e        f        g        h 
#> 6.518214 4.305639 6.106827 5.118304 4.255043 5.678025 4.345129 4.914239 
#>        i        j 
#> 6.727135 6.030590 
#> 
#> [[2]]
#>        a        b        c        d        e        f        g        h 
#> 7.969410 7.558780 8.265322 8.004338 6.862732 5.517313 8.061683 4.062385 
#>        i        j 
#> 6.693430 7.858993 
#> 
#> [[3]]
#>         a         b         c         d         e         f         g 
#>  9.066362  9.921300 10.724671  8.643903  9.783747  9.102569 10.489579 
#>         h         i         j 
#>  9.156070  9.863332 11.148255


# Error: bind_rows() rejects this list of named vectors as-is
bind_rows(my_tibble)
#> Error in bind_rows_(x, .id): Argument 1 must have names


# Produces the desired output, but rbind_list() raises a deprecation warning
rbind_list(my_tibble) %>%
  mutate(sample = c("A", "B", "C")) %>%
  select(sample, everything())
#> Warning: 'rbind_list' is deprecated.
#> Use 'bind_rows()' instead.
#> See help("Deprecated")
#> # A tibble: 3 x 11
#>   sample     a     b     c     d     e     f     g     h     i     j
#>   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A       6.52  4.31  6.11  5.12  4.26  5.68  4.35  4.91  6.73  6.03
#> 2 B       7.97  7.56  8.27  8.00  6.86  5.52  8.06  4.06  6.69  7.86
#> 3 C       9.07  9.92 10.7   8.64  9.78  9.10 10.5   9.16  9.86 11.1


# Desired output: row-bind the named vectors into a matrix, convert it to a
# tibble, then add the group labels and move them to the first column.
# as_tibble() replaces the deprecated as.tibble() alias.
do.call(rbind, my_tibble) %>%
  as_tibble() %>%
  mutate(sample = c("A", "B", "C")) %>%
  select(sample, everything())
#> # A tibble: 3 x 11
#>   sample     a     b     c     d     e     f     g     h     i     j
#>   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A       6.52  4.31  6.11  5.12  4.26  5.68  4.35  4.91  6.73  6.03
#> 2 B       7.97  7.56  8.27  8.00  6.86  5.52  8.06  4.06  6.69  7.86
#> 3 C       9.07  9.92 10.7   8.64  9.78  9.10 10.5   9.16  9.86 11.1

reprex 包(v0.2.0)于 2018 年 6 月 12 日创建。

标签: r dplyr purrr

解决方案


这些 list 元素是带名称的向量(named vector)。我们先将每个元素转换为 tibble,然后执行 bind_rows,或者直接使用 map_df

# Turn each named vector into a one-row tibble; map_df() row-binds the results.
my_tibble %>%
  map_df(function(vec) as_tibble(as.list(vec)))
# A tibble: 3 x 10
#      a     b     c     d     e     f     g     h     i     j
#  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1  7.40  4.96  5.69  5.03  4.26  5.19  3.20  6.47  5.15  7.17
#2  7.48  6.29  7.61  6.07  5.75  7.29  6.56  7.00  7.07  6.41
#3  9.43  9.86 11.2   8.48 10.6  10.3  11.1   9.70 10.4  10.3 

或者转换为 data.frame(使用 as.data.frame.list)

# Same idea, converting each named vector with as.data.frame.list instead.
map_df(my_tibble, as.data.frame.list)
#        a        b         c        d         e         f         g        h
#1 7.401618 4.960760  5.689739 5.028002  4.256727  5.188792  3.195041 6.465555
#2 7.475510 6.290054  7.610726 6.065902  5.746367  7.291446  6.556708 7.001105
#3 9.431331 9.864821 11.178087 8.476433 10.593946 10.332950 11.063100 9.695816
#          i         j
#1  5.153253  7.172612
#2  7.074341  6.410479
#3 10.370019 10.267099

关于第一个问题,我们可以先在 mutate 中使用 map,再用 pull 取出该列

# Store each group's 10 draws (mean = n_seqs) as a named list in a list-column,
# extract that column, and row-bind its elements into a tibble.
tibble(group = c("A", "B", "C"), n_seqs = c(5, 7, 10)) %>%
  mutate(
    new_col = map(
      n_seqs,
      function(mu) set_names(as.list(rnorm(n = 10, mean = mu)), letters[1:10])
    )
  ) %>%
  pull(new_col) %>%
  bind_rows()
# A tibble: 3 x 10
#      a     b     c     d     e     f     g     h     i     j
#  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1  5.45  4.98  4.68  4.07  3.51  3.92  6.00  4.38  3.62  6.87
#2  7.43  6.76  8.06  7.89  6.38  9.21  6.74  5.58  6.86  7.21
#3 12.3  10.1  10.5   9.92  9.67  9.97 10.8  12.1  11.0  11.2 

根据评论,如果我们还需要 group 列

# Keep the group column: build each row's 10 draws (mean = n_seqs) as a one-row
# tibble inside a list-column, drop the mean column, then unnest the list-column.
# Mapping over n_seqs directly avoids nest(-group) and a bare unnest(), both of
# which newer tidyr releases warn about; unnest(new_col) names the column
# explicitly.
tibble(group = c("A", "B", "C"), n_seqs = c(5, 7, 10)) %>%
  mutate(new_col = map(n_seqs, function(mu) {
    rnorm(n = 10, mean = mu) %>%
      set_names(letters[1:10]) %>%
      as.list() %>%
      as_tibble()
  })) %>%
  select(-n_seqs) %>%
  unnest(new_col)
# A tibble: 3 x 11
#  group     a     b     c     d     e     f     g     h     i     j
#  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 A      6.77  5.34  4.38  4.56  4.49  5.19  5.18  5.92  5.32  4.63
#2 B      6.06  7.63  6.94  7.18  8.10  8.75  6.05  8.64  6.13  7.27
#3 C     10.2   9.72 11.4   9.34 10.7   9.99  9.07 11.2   7.91  9.47

注意:值不同,因为我们没有设置种子


推荐阅读