首页 > 解决方案 > 在 R / tidyr 中处理巨大的嵌套数据集

问题描述

[编辑:2021 年 5 月 3 日,数据集现在包括以前的解决方案无法处理的现实条件。对于更改数据,我深表歉意,但我没有看到更好的方法来澄清先前建议的解决方案中的差距。]

xf = read.table(header=T, sep=",", text="
Input,Time,Word,Copy1,Copy2,Copy3,Copy30
ark,10,ark,-0.1,-0.1,-0.1,-0.1
ark,20,ark,0.0,0.5,0.55,0.01
ark,30,ark,0.01,0.1,0.2,0.05
ark,40,ark,0.02,0.3,0.5,0.1
ark,50,ark,0.01,0.2,0.4,-0.1
ark,10,ad,-0.1,-0.1,-0.1,-0.1
ark,20,ad,0.0,0.01,0.02,0.01
ark,30,ad,0.01,0.03,0.1,0.04
ark,40,ad,0.02,0.12,0.15,0.04
ark,50,ad,0.01,0.01,0.05,0.02
ark,10,bark,-0.1,-0.1,-0.1,-0.1
ark,20,bark,0.02,0.12,0.1,0.01
ark,30,bark,0.03,0.15,0.12,0.02
ark,40,bark,0.02,0.22,0.1,0.03
ark,50,bark,0.01,0.1,0.05,0.02
ark,10,bar,-0.1,-0.1,-0.1,-0.1
ark,20,bar,0.01,0.1,0.02,-0.05
ark,30,bar,0.01,0.12,0.03,0
ark,40,bar,0.02,0.15,0.03,0.01
ark,50,bar,0.01,0.05,0.02,0.01
bark,10,ark,-0.1,-0.1,-0.1,-0.1
bark,20,ark,0.0,0.04,0.05,0.01
bark,30,ark,0.01,0.08,0.1,0.05
bark,40,ark,0.02,0.05,0.2,0.1
bark,50,ark,0.01,0.01,0.3,-0.1
bark,10,ad,-0.1,-0.1,-0.1,-0.1
bark,20,ad,0.0,0.01,0.01,0.01
bark,30,ad,0.01,0.02,0.05,0.04
bark,40,ad,0.02,0.03,0.06,0.04
bark,50,ad,0.01,0.02,0.01,0.02
bark,10,bark,-0.1,-0.1,-0.1,-0.1
bark,20,bark,0.02,0.15,0.1,0.01
bark,30,bark,0.03,0.3,0.12,0.02
bark,40,bark,0.02,0.7,0.1,0.03
bark,50,bark,0.01,0.7,0.05,0.02
bark,10,bar,-0.1,-0.1,-0.1,-0.1
bark,20,bar,0.01,0.13,0.04,-0.05
bark,30,bar,0.01,0.25,0.06,0
bark,40,bar,0.02,0.4,0.08,0.01
bark,50,bar,0.01,0.35,0.01,0.01
") %>% arrange(Input,Word,Time)

我想以两种方式减少这些数据。

(1) 对于每个 Input x Word 组合,根据整个时间序列的最大值为一个单词选择一个Copy,并且

(2) 基于保留副本的最大值(每个输入 x 字 1 个),减少到“topX”字。


我最初的问题不清楚并且变得非常笨拙。@DanChaltiel 使用非常接近完整解决方案的 pivot_longer 提供了部分答案,但我无法清楚地解释第一次减少。所以我把它分解成一个单独的问题,@akrun 像这样扩展了@DanChaltiel 的解决方案,解决了第一部分(2021 年 5 月 3 日更新以反映对解决方案的修复):

library(tidyverse)
# Reduce data to one Copy of each Input x Word combination
# based on maxima for entire time series, no matter what
# Time those maxima occur. Using pivot_longer was due to 
# answer from @DanChaltiel, but getting it to work on 
# Input x Word maxima over the whole time series (rather 
# than maxima of Input x Word x Time) was due to @akrun 
# for https://stackoverflow.com/questions/67351185/
xf2 <- xf %>% 
  pivot_longer(cols = starts_with('Copy'), names_to = 'copy_name', 
               values_to = 'Value') %>% 
  group_by(Input, Time, Word) %>% 
  arrange(Value) %>%
  slice(if(all(Value <= 0)) n() 
        else tail(which(Value > 0), 1))%>% 
  group_by(Input, Word) %>% 
  mutate(copy_name = copy_name[which.max(Value)]) %>%
  ungroup

print((xf2 %>% arrange(Input, Word)), n = nrow(xf2)) # print all rows

# A tibble: 40 x 5
# Input  Time Word  copy_name Value
# <fct> <int> <fct> <chr>     <dbl>
# 1 ark       10 ad    Copy3     -0.1 
# 2 ark       20 ad    Copy3      0.02
# 3 ark       30 ad    Copy3      0.1 
# 4 ark       40 ad    Copy3      0.15
# 5 ark       50 ad    Copy3      0.05
# 6 ark       10 ark   Copy3     -0.1 
# 7 ark       20 ark   Copy3      0.55
# 8 ark       30 ark   Copy3      0.2 
# 9 ark       40 ark   Copy3      0.5 
# 10 ark      50 ark   Copy3      0.4 
# 11 ark      10 bar   Copy2     -0.1 
# 12 ark      20 bar   Copy2      0.1 
# 13 ark      30 bar   Copy2      0.12
# 14 ark      40 bar   Copy2      0.15
# 15 ark      50 bar   Copy2      0.05
# 16 ark      10 bark  Copy2     -0.1 
# 17 ark      20 bark  Copy2      0.12
# 18 ark      30 bark  Copy2      0.15
# 19 ark      40 bark  Copy2      0.22
# 20 ark      50 bark  Copy2      0.1 
# 21 bark     10 ad    Copy3     -0.1 
# 22 bark     20 ad    Copy3      0.01
# 23 bark     30 ad    Copy3      0.05
# 24 bark     40 ad    Copy3      0.06
# 25 bark     50 ad    Copy3      0.02
# 26 bark     10 ark   Copy3     -0.1 
# 27 bark     20 ark   Copy3      0.05
# 28 bark     30 ark   Copy3      0.1 
# 29 bark     40 ark   Copy3      0.2 
# 30 bark     50 ark   Copy3      0.3 
# 31 bark     10 bar   Copy2     -0.1 
# 32 bark     20 bar   Copy2      0.13
# 33 bark     30 bar   Copy2      0.25
# 34 bark     40 bar   Copy2      0.4 
# 35 bark     50 bar   Copy2      0.35
# 36 bark     10 bark  Copy2     -0.1 
# 37 bark     20 bark  Copy2      0.15
# 38 bark     30 bark  Copy2      0.3 
# 39 bark     40 bark  Copy2      0.7 
# 40 bark     50 bark  Copy2      0.7 

因此,这成功地将基于时间 1..100 系列中的最大值的每个 Input x Word 组合的数据减少到一个副本。

第二个挑战是将数据减少到每个输入的前 X 个单词。

@AnilGoyal 建议的方法适用于更简单的样本数据,但由于包含的时间步数和 topX 的值之间存在偶然的偶然性。

到目前为止,基于@AnilGoyal 的示例,我能够做的是根据每个输入的最大值识别topX 单词。以下是查找前 3 名和前 2 名的 2 个示例:

topX = 3
xftop3 <- xf2 %>% group_by(Input, Word) %>%
  slice_max(Value, with_ties=FALSE) %>%
  arrange(desc(Value)) %>%
  group_by(Input) %>%
  filter(1:n() <= topX) %>%
  arrange(Input, Value)

xftop3

# A tibble: 6 x 5
# Groups:   Input [2]
# Input  Time Word  copy_name Value
# <fct> <int> <fct> <chr>     <dbl>
# 1 ark      40 ad    Copy3      0.15
# 2 ark      40 bark  Copy2      0.22
# 3 ark      20 ark   Copy3      0.55
# 4 bark     50 ark   Copy3      0.3 
# 5 bark     40 bar   Copy2      0.4 
# 6 bark     40 bark  Copy2      0.7 

topX = 2
xftop2 <- xf2 %>% group_by(Input, Word) %>%
  slice_max(Value, with_ties=FALSE) %>%
  arrange(desc(Value)) %>%
  group_by(Input) %>%
  filter(1:n() <= topX) %>%
  arrange(Input, Value)

xftop2

# A tibble: 4 x 5
# Groups:   Input [2]
# Input  Time Word  copy_name Value
# <fct> <int> <fct> <chr>     <dbl>
# 1 ark      40 bark  Copy2      0.22
# 2 ark      20 ark   Copy3      0.55
# 3 bark     40 bar   Copy2      0.4 
# 4 bark     40 bark  Copy2      0.7 

然后我不知道该怎么做是使用那个小标题将数据集减少到只有那些 Input x Word 组合。样本数据和 topX = 2 的所需输出将是:

# A tibble: 20 x 5
   Input  Time Word  copy_name Value
   <fct> <int> <fct> <chr>     <dbl>
 1 ark      10 ark   Copy3     -0.1 
 2 ark      20 ark   Copy3      0.55
 3 ark      30 ark   Copy3      0.2 
 4 ark      40 ark   Copy3      0.5 
 5 ark      50 ark   Copy3      0.4 
 6 ark      10 bark  Copy2     -0.1 
 7 ark      20 bark  Copy2      0.12
 8 ark      30 bark  Copy2      0.15
 9 ark      40 bark  Copy2      0.22
10 ark      50 bark  Copy2      0.1 
11 bark     10 bar   Copy2     -0.1 
12 bark     20 bar   Copy2      0.13
13 bark     30 bar   Copy2      0.25
14 bark     40 bar   Copy2      0.4 
15 bark     50 bar   Copy2      0.35
16 bark     10 bark  Copy2     -0.1 
17 bark     20 bark  Copy2      0.15
18 bark     30 bark  Copy2      0.3 
19 bark     40 bark  Copy2      0.7 
20 bark     50 bark  Copy2      0.7 

我将不胜感激任何建议。

标签: rdplyrtidyr

解决方案


到目前为止,我所了解的是,您semi_join只需要根据右数据参数中可用的 row_combinations 过滤数据(左侧)。

topX = 2
semi_join(xf2, xf2 %>% group_by(Input, Word) %>%
  slice_max(Value, with_ties=FALSE) %>%
  group_by(Input) %>%
  slice_max(Value, n= topX, with_ties = FALSE) %>%
  select(Input, Word), by = c('Input', 'Word'))

# A tibble: 20 x 5
   Input  Time Word  copy_name Value
   <chr> <int> <chr> <chr>     <dbl>
 1 ark      10 ark   Copy3     -0.1 
 2 ark      10 bark  Copy2     -0.1 
 3 ark      20 ark   Copy3      0.55
 4 ark      20 bark  Copy2      0.12
 5 ark      30 ark   Copy3      0.2 
 6 ark      30 bark  Copy2      0.15
 7 ark      40 ark   Copy3      0.5 
 8 ark      40 bark  Copy2      0.22
 9 ark      50 ark   Copy3      0.4 
10 ark      50 bark  Copy2      0.1 
11 bark     10 bar   Copy2     -0.1 
12 bark     10 bark  Copy2     -0.1 
13 bark     20 bar   Copy2      0.13
14 bark     20 bark  Copy2      0.15
15 bark     30 bar   Copy2      0.25
16 bark     30 bark  Copy2      0.3 
17 bark     40 bar   Copy2      0.4 
18 bark     40 bark  Copy2      0.7 
19 bark     50 bar   Copy2      0.35
20 bark     50 bark  Copy2      0.7 

您的第二部分将通过此代码解决

topX <- 2L
xf2 %>% semi_join(xf2 %>% group_by(Input) %>%
  slice_max(Value, n= topX) %>% select(Input, Word), by = c("Input", "Word"))

# A tibble: 12 x 5
   Input  Time Word  copy_name Value
   <chr> <int> <chr> <chr>     <dbl>
 1 ark       1 ark   Copy3      0   
 2 ark       1 bark  Copy2      0   
 3 ark      50 ark   Copy3      0.05
 4 ark      50 bark  Copy2      0.06
 5 ark     100 ark   Copy3      0.55
 6 ark     100 bark  Copy2      0.2 
 7 bark      1 bar   Copy2      0   
 8 bark      1 bark  Copy2      0   
 9 bark     50 bar   Copy2      0.7 
10 bark     50 bark  Copy2      0.75
11 bark    100 bar   Copy2      0.4 
12 bark    100 bark  Copy2      0.6

我认为作为第一部分你也可以使用它semi_join请检查

xf %>% 
  pivot_longer(cols = starts_with('Copy'), names_to = 'copy_name', 
               values_to = 'Value') %>% 
  semi_join(xf %>% pivot_longer(cols = starts_with('Copy'), names_to = 'copy_name', 
                                values_to = 'Value') %>% 
              group_by(Input, Word) %>%
              slice_max(Value) %>%
              select(Input, Word, copy_name), 
            by = c('Input', 'Word', 'copy_name')) -> xf2


推荐阅读