首页 > 解决方案 > 如何使用 dplyr 递归连接 tibble 中的字符,直到字符重复

问题描述

我正在尝试使用 dplyr 连接 tibble 中前面各行的字符,直到某个字符重复出现。一旦某个字符重复,就从这个重复的字符开始,重新进行同样的连接过程。下面是一个可重现示例(reprex),其中包含源数据框 (df)、我连接字符的失败尝试 (df1),以及期望的连接结果 (df2)。

在我的尝试中,连接过程似乎只在我们创建 bf 时发生一次。不幸的是,我不确定为什么会这样。我对 dplyr 还是很陌生,所以我怀疑我遗漏了一些非常明显的东西。另外,如果有更好的方法来解决这个问题,我很乐意扩大我的视野和知识。

library (tidyverse)

# Source data: 14 rows with a numeric (double) id and a one-character code.
df <- tibble(
  id = as.double(1:14),
  cde = strsplit("bfcebfcedfbced", split = "")[[1]]
)

df
#> # A tibble: 14 x 2
#>       id cde  
#>    <dbl> <chr>
#>  1     1 b    
#>  2     2 f    
#>  3     3 c    
#>  4     4 e    
#>  5     5 b       
#>  6     6 f    
#>  7     7 c    
#>  8     8 e    
#>  9     9 d    
#> 10    10 f    
#> 11    11 b    
#> 12    12 c    
#> 13    13 e    
#> 14    14 d

# Failed attempt at the running concatenation (kept as-is for illustration).
# Each mutate() evaluates its expression once over the whole column, so
# lag(cum_cde) in the last step reads the cum_cde produced by the PREVIOUS
# step -- where only row 1 is non-empty -- not the values being computed now.
# Consequently only row 2 sees a non-empty predecessor ("b" -> "bf"); every
# later row concatenates "" with its own cde and ends up as just cde.
df1 <- df %>% 
  # Step 1: start with an empty accumulator column.
  mutate(cum_cde = "") %>% 
  # Step 2: seed the first row with its own code.
  mutate(cum_cde = if_else(id ==1,cde,cum_cde)) %>% 
  # Step 3: append cde to the previous row's accumulator unless cde already
  # occurs in it; this is vectorised, not recursive (see note above).
  mutate(cum_cde = if_else(id > 1 & str_count(lag(cum_cde),(cde)) < 1,str_c(lag(cum_cde),cde,sep="",collapse=NULL),cde))

df1
#> # A tibble: 14 x 3
#>       id cde   cum_cde
#>    <dbl> <chr> <chr>  
#>  1     1 b     b      
#>  2     2 f     bf     
#>  3     3 c     c      
#>  4     4 e     e      
#>  5     5 b     b      
#>  6     6 f     f      
#>  7     7 c     c      
#>  8     8 e     e      
#>  9     9 d     d      
#> 10    10 f     f      
#> 11    11 b     b      
#> 12    12 c     c      
#> 13    13 e     e      
#> 14    14 d     d



# Desired outcome: `result` grows by one code per row and restarts from the
# current code as soon as that code already occurs in the running string.
df2 <- tibble(
  id = as.double(1:14),
  cde = strsplit("bfcebfcedfbced", split = "")[[1]],
  result = c(
    "b", "bf", "bfc", "bfce",
    "b", "bf", "bfc", "bfce", "bfced",
    "f", "fb", "fbc", "fbce", "fbced"
  )
)

df2
#> # A tibble: 14 x 3
#>       id cde   result
#>    <dbl> <chr> <chr> 
#>  1     1 b     b     
#>  2     2 f     bf    
#>  3     3 c     bfc   
#>  4     4 e     bfce  
#>  5     5 b     b     
#>  6     6 f     bf    
#>  7     7 c     bfc   
#>  8     8 e     bfce  
#>  9     9 d     bfced 
#> 10    10 f     f     
#> 11    11 b     fb    
#> 12    12 c     fbc   
#> 13    13 e     fbce  
#> 14    14 d     fbced


<sup>Created on 2019-12-23 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>

标签: r dplyr

解决方案


带有for循环的选项是

library(stringr)
# Running concatenation of df$cde that restarts whenever a code repeats.
#   v1 : preallocated result, one string per row of df.
#   j  : index of the row where the current run started.
v1 <- character(nrow(df))
j <- 1
for (i in seq_len(nrow(df))) {
  # Candidate string: every code from the start of the current run up to row i.
  v1[i] <- paste(df$cde[j:i], collapse = "")
  # fixed() makes the code match literally; without it str_count() would
  # interpret codes such as "." or "(" as regular expressions.
  if (str_count(v1[i], fixed(df$cde[i])) > 1) {
    # The current code was already in the run: restart the run at this row.
    v1[i] <- df$cde[i]
    j <- i
  }
}

v1
#[1] "b"     "bf"    "bfc"   "bfce" 
#[5] "b"     "bf"    "bfc"   "bfce"  "bfced" 
#[10]"f"     "fb"    "fbc"   "fbce"  "fbced"

或使用accumulate

library(purrr)
library(dplyr)
# Running concatenation with restart, done in a single accumulate() pass.
# The reducer receives the running string (`acc`) and the next code (`ch`):
# if `ch` already occurs in `acc` the run restarts at `ch`, otherwise `ch`
# is appended. fixed() makes the match literal rather than a regex.
#
# This replaces grouping by cummax(str_count(accumulate(cde, str_c), cde)),
# which only worked by coincidence: for cde = c("a", "a", "b", "b") it
# yields "a", "a", "ab", "abb" instead of "a", "a", "ab", "b".
df %>%
  mutate(
    result = accumulate(
      cde,
      function(acc, ch) if (str_detect(acc, fixed(ch))) ch else str_c(acc, ch)
    )
  )
# A tibble: 14 x 3
#      id cde   result
#   <dbl> <chr> <chr> 
# 1     1 b     b     
# 2     2 f     bf    
# 3     3 c     bfc   
# 4     4 e     bfce  
# 5     5 b     b     
# 6     6 f     bf    
# 7     7 c     bfc   
# 8     8 e     bfce  
# 9     9 d     bfced 
#10    10 f     f     
#11    11 b     fb    
#12    12 c     fbc   
#13    13 e     fbce  
#14    14 d     fbced 

推荐阅读