首页 > 解决方案 > 按唯一标识符折叠多行,多个值(列),每个变量的多个元素(行)

问题描述

我试图通过 R 中的唯一标识符折叠多行,但每列可能有多个变量,我想将它们保存在单独的行中。我怎么做?我试过“separate_rows”函数,但它说每个嵌套列必须有相同数量的元素。我的没有。

## Example1 start
# Example 1: long layout with one value per row. Every row carries the
# identifier plus exactly one non-empty cell among var1/var2/var3; the other
# cells are empty strings. Numeric literals are coerced to character by c().
# sample_A.1 holds sample_A's var1 value (30), as shown in the table that
# accompanies this example; it must not duplicate sample_A.2.
sample_A.1 <- c("sample_A", 30, "", "")
sample_A.2 <- c("sample_A", "", "D", "")
sample_A.3 <- c("sample_A", "", "", "red")
sample_B.1 <- c("sample_B", 28, "", "")
sample_B.2 <- c("sample_B", "", "D", "")
sample_B.3 <- c("sample_B", "", "N", "")
sample_B.4 <- c("sample_B", "", "", "orange")
sample_C.1 <- c("sample_C", 27, "", "")
sample_C.2 <- c("sample_C", 32, "", "")
sample_C.3 <- c("sample_C", "", "E", "")
sample_C.4 <- c("sample_C", "", "", "orange")
sample_C.5 <- c("sample_C", "", "", "yellow")
sample_C.6 <- c("sample_C", "", "", "green")

# Stack the row vectors into a character matrix (row names come from the
# variable names) and convert it to a data frame of character columns.
my.data <- data.frame(
  rbind(
    sample_A.1, sample_A.2, sample_A.3,
    sample_B.1, sample_B.2, sample_B.3, sample_B.4,
    sample_C.1, sample_C.2, sample_C.3, sample_C.4, sample_C.5, sample_C.6
  ),
  stringsAsFactors = FALSE
)
colnames(my.data) <- c("identifier", "var1", "var2", "var3")

# identifier   var1   var2   var3
# sample_A     30
# sample_A            D
# sample_A                   red
# sample_B     28
# sample_B            D
# sample_B            N
# sample_B                   orange
# sample_C     27
# sample_C     32
# sample_C            E
# sample_C                   orange
# sample_C                   yellow
# sample_C                   green

## Or

## Example2 start
# Example 2: wide layout with one row per identifier. A cell that holds
# several values stores them as a single ", "-separated string.
sample_A <- c("sample_A", "30", "D", "red")
sample_B <- c("sample_B", "28", "D, N", "orange")
sample_C <- c("sample_C", "27, 32", "E", "orange, yellow, green")

# Bind the three rows into a character matrix, convert it to a data frame of
# character columns, and attach the column names in the same step.
my.data2 <- setNames(
  as.data.frame(rbind(sample_A, sample_B, sample_C), stringsAsFactors = FALSE),
  c("identifier", "var1", "var2", "var3")
)

# identifier   var1   var2   var3
# sample_A     30     D      red
# sample_B     28     D, N   orange
# sample_C     27, 32 E      orange, yellow, green

为了从上面的示例 1 得到示例 2,我使用了以下代码(需要 dplyr)...

library(dplyr)

# Collapse the long table (one value per row) into one row per identifier.
# Within each group, every non-identifier column keeps its non-blank values
# and joins them with ", ". Blank cells are empty strings in my.data, so they
# are dropped with nzchar(); is.na() alone would not remove them. Joining with
# ", " directly (instead of joining with " " and substituting afterwards)
# keeps values that contain spaces intact and cannot emit ", , " for
# interleaved blanks. A plain function is passed because funs() is deprecated.
# Note: my.data2a already holds the ", "-joined strings.
my.data2a <- my.data %>%
  group_by(identifier) %>%
  summarize_all(function(values) {
    values <- trimws(values)
    kept <- values[!is.na(values) & nzchar(values)]
    paste(kept, collapse = ", ")
  })

# Convert to a plain data frame of character columns, trimming every column
# (including the identifier).
my.data2 <- data.frame(lapply(my.data2a, trimws), stringsAsFactors = FALSE)

我尝试使用以下代码将 my.data2 中的数据融合到单独的行中......(需要 tidyr 和 tibble)

# Columns of my.data2 whose cells may hold several values.
data2colnames <- c("var1", "var2", "var3")

# Attempt to split all listed columns into separate rows in a single call.
# This is the call reported to fail with "all nested columns must have the
# same number of elements": within one row the columns hold different
# numbers of values.
separate_rows(my.data2, data2colnames)

但我收到错误:“错误:所有嵌套列必须具有相同数量的元素。”(Error: All nested columns must have the same number of elements.)我需要得到原帖“示例输出”链接中“结束于”(ending with)的那张表,其内容基本如下:

identifier  var1  var2  var3
sample_A    30    D     red
sample_B.1  28    D     orange
sample_B.2        N
sample_C.1  27    E     orange
sample_C.2  32          yellow
sample_C.3              green

标签: r aggregate csv

解决方案


我尝试了一个比较丑陋的办法,肯定还有更优雅的写法。

library(purrr)

# Work on a copy of the long table and mark blank cells as missing so they
# can be dropped per group.
df <- my.data
df[df == ""] <- NA

# Name of the grouping column; every other column holds values to spread.
id_col <- "identifier"

# For every identifier, reduce each column to its distinct non-missing values
# in order of first appearance (no sorting, so "D" stays before "N" and 27
# before 32). lapply() always yields a list, whereas sapply() would simplify
# to a vector or matrix whenever all columns end up with the same length.
x <- df %>%
  split(.[[id_col]]) %>%
  map(function(grp) {
    lapply(grp, function(col) unique(col[!is.na(col)]))
  })

# Pad the columns of each group to the length of its longest column: the
# identifier is repeated, every other column is filled with "". The
# identifier column is recognised by name rather than by a value prefix.
xx <- map_dfr(x, function(cols) {
  n_rows <- max(lengths(cols))
  Map(function(vals, col_name) {
    filler <- if (col_name == id_col) vals[1] else ""
    c(vals, rep(filler, n_rows - length(vals)))
  }, cols, names(cols))
}) %>% as.data.frame()

# Identifiers that occupy more than one row get a ".1", ".2", ... suffix.
# ave() writes results back position by position, so the outcome does not
# depend on the row order of xx.
nm <- xx$identifier
xx$identifier <- ave(nm, nm, FUN = function(ids) {
  if (length(ids) > 1) paste0(ids, ".", seq_along(ids)) else ids
})

xx

推荐阅读