首页 > 解决方案 > 我无法根据字符扩展行

问题描述

我有这样的数据

# Reproducible input (dput output): 6 rows, 7 columns. UniID, Refseq_IDs and
# Orthology are factors whose labels contain ":"; UniID is the column whose
# ":"-separated tokens should each end up on their own row.
df <- structure(list(Division = structure(c(1L, 1L, 1L, 2L, 2L, 3L), .Label = c("Main data", 
"Second data", "Third data"), class = "factor"), Gene = structure(1:6, .Label = c("ABI3BP", 
"ADIPOQ", "AEBP1", "AGRN", "AMBN", "AMELX"), class = "factor"), 
    IDs = c(17265L, 13633L, 303L, 329L, 452L, 461L), IDs.Links = c(17265L, 
    13633L, 303L, 329L, 452L, 461L), UniID = structure(c(1L, 
    4L, 2L, 3L, 6L, 5L), .Label = c("B4DSV9:D3YTG3:E9PPR9:E9PRB5:H0Y897", 
    "C9JLQ8:H7C0W8:H7C1J5", "H0Y5U1:O00468", "Q15848", "Q99217", 
    "Q9NP70"), class = "factor"), Refseq_IDs = structure(c(4L, 
    3L, 1L, 6L, 5L, 2L), .Label = c("NP_001120.3", "NP_001133.1:NP_872621.1:NP_872622.1", 
    "NP_001171271.1:NP_004788.1", "NP_056244.2:XP_005247340.1", 
    "NP_057603.1", "NP_940978.2:XP_005244806.1:XP_006710696.1"
    ), class = "factor"), Orthology = structure(1:6, .Label = c("Mouse:Abi3bp|", 
    "Mouse:Adipoq|", "Mouse:Aebp1|", "Mouse:Agrn|", "Mouse:Ambn|", 
    "Mouse:Amelx|"), class = "factor")), class = "data.frame", row.names = c(NA, 
-6L))

在名为 UniID 的列中,有许多由 “:” 分隔的字符串。我想把其中的每一个放到单独的新行中,并重复其他列的值。

期望的输出如下所示

# Desired result (dput output): 13 rows, one per ":"-separated UniID token.
# NOTE(review): rows 8-9 carry Gene "AEBP2"/"AEBP3" and row 11 carries a
# Refseq_IDs value ending in ".2"; none of these values occur in df. They look
# like hand-editing/autofill artifacts rather than intended output -- confirm.
df2 <-structure(list(Division = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 3L), .Label = c("Main data", "Second data", 
"Third data"), class = "factor"), Gene = structure(c(1L, 1L, 
1L, 1L, 1L, 2L, 3L, 4L, 5L, 6L, 6L, 7L, 8L), .Label = c("ABI3BP", 
"ADIPOQ", "AEBP1", "AEBP2", "AEBP3", "AGRN", "AMBN", "AMELX"), class = "factor"), 
    IDs = c(17265L, 17265L, 17265L, 17265L, 17265L, 13633L, 303L, 
    303L, 303L, 329L, 329L, 452L, 461L), IDs.Links = c(17265L, 
    17265L, 17265L, 17265L, 17265L, 13633L, 303L, 303L, 303L, 
    329L, 329L, 452L, 461L), UniID = structure(c(1L, 3L, 4L, 
    5L, 7L, 11L, 2L, 8L, 9L, 6L, 10L, 13L, 12L), .Label = c("B4DSV9", 
    "C9JLQ8", "D3YTG3", "E9PPR9", "E9PRB5", "H0Y5U1", "H0Y897", 
    "H7C0W8", "H7C1J5", "O00468", "Q15848", "Q99217", "Q9NP70"
    ), class = "factor"), Refseq_IDs = structure(c(4L, 4L, 4L, 
    4L, 4L, 3L, 1L, 1L, 1L, 6L, 7L, 5L, 2L), .Label = c("NP_001120.3", 
    "NP_001133.1:NP_872621.1:NP_872622.1", "NP_001171271.1:NP_004788.1", 
    "NP_056244.2:XP_005247340.1", "NP_057603.1", "NP_940978.2:XP_005244806.1:XP_006710696.1", 
    "NP_940978.2:XP_005244806.1:XP_006710696.2"), class = "factor"), 
    Orthology = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L, 
    4L, 4L, 5L, 6L), .Label = c("Mouse:Abi3bp|", "Mouse:Adipoq|", 
    "Mouse:Aebp1|", "Mouse:Agrn|", "Mouse:Ambn|", "Mouse:Amelx|"
    ), class = "factor")), class = "data.frame", row.names = c(NA, 
-13L))

我找到了其他帖子,我试图这样做,但没有任何成功。

# Split each UniID on ":" (one character vector per input row), then repeat
# IDs once per token. Only the two columns named in data.frame() are built, so
# the remaining columns of df are not carried over; the token column is named
# "director" here, not "UniID".
s <- strsplit(as.character(df$UniID), ':')
mydf<-data.frame(director=unlist(s), IDs=rep(df$IDs, lengths(s)))

但这样只得到 IDs 和拆分后的 UniID 两列,其余列都丢失了。

# .SD and `by =` are data.table syntax. Per the error printed below, `[` was
# dispatched to `[.data.frame`, which does not accept `by`.
# NOTE(review): presumably df must first be converted to a data.table (e.g.
# data.table::as.data.table(df)) for this form to run -- confirm.
mydf<- df[, lapply(.SD, function(x) unlist(tstrsplit(x, ":", fixed=TRUE))), by = IDs][!is.na(UniID)]
Error in `[.data.frame`(df, , lapply(.SD, function(x) unlist(tstrsplit(x,  : 
  unused argument (by = IDs)

我还试了这个:

# Another data.table-style call (`by = .(...)`) applied to a plain data.frame;
# per the error printed below it fails with the same "unused argument" message.
# The leading "+" on the second line appears to be the console continuation
# prompt copied from the session, not part of the expression.
mydf<- df[, strsplit(as.character(UniID), ":", fixed=TRUE), 
+           by = .(IDs, UniID)][,.(UniID = V1, IDs)]
Error in `[.data.frame`(df, , strsplit(as.character(UniID), ":", fixed = TRUE),  : 
  unused argument (by = .(IDs, UniID))

标签: r

解决方案


一种使用 dplyr(配合 tidyr 的 unnest)的可能方案:

# Expand df to one row per ":"-separated UniID token, repeating the values of
# all other columns. mutate() and %>% come from dplyr; unnest() comes from
# tidyr, so both packages are attached explicitly.
library(dplyr)
library(tidyr)

df %>%
  # strsplit() needs character input, so the factor is converted first;
  # fixed = TRUE treats ":" as a literal separator rather than a regex.
  # The result is a list-column with one character vector per row.
  mutate(UniID = strsplit(as.character(UniID), ":", fixed = TRUE)) %>%
  # Name the list-column explicitly: tidyr >= 1.0.0 deprecates calling
  # unnest() without `cols` and emits a warning. Column order in the result
  # may differ between tidyr versions.
  unnest(cols = UniID)

      Division   Gene   IDs IDs.Links                                Refseq_IDs     Orthology  UniID
1    Main data ABI3BP 17265     17265                NP_056244.2:XP_005247340.1 Mouse:Abi3bp| B4DSV9
2    Main data ABI3BP 17265     17265                NP_056244.2:XP_005247340.1 Mouse:Abi3bp| D3YTG3
3    Main data ABI3BP 17265     17265                NP_056244.2:XP_005247340.1 Mouse:Abi3bp| E9PPR9
4    Main data ABI3BP 17265     17265                NP_056244.2:XP_005247340.1 Mouse:Abi3bp| E9PRB5
5    Main data ABI3BP 17265     17265                NP_056244.2:XP_005247340.1 Mouse:Abi3bp| H0Y897
6    Main data ADIPOQ 13633     13633                NP_001171271.1:NP_004788.1 Mouse:Adipoq| Q15848
7    Main data  AEBP1   303       303                               NP_001120.3  Mouse:Aebp1| C9JLQ8
8    Main data  AEBP1   303       303                               NP_001120.3  Mouse:Aebp1| H7C0W8
9    Main data  AEBP1   303       303                               NP_001120.3  Mouse:Aebp1| H7C1J5
10 Second data   AGRN   329       329 NP_940978.2:XP_005244806.1:XP_006710696.1   Mouse:Agrn| H0Y5U1
11 Second data   AGRN   329       329 NP_940978.2:XP_005244806.1:XP_006710696.1   Mouse:Agrn| O00468
12 Second data   AMBN   452       452                               NP_057603.1   Mouse:Ambn| Q9NP70
13  Third data  AMELX   461       461       NP_001133.1:NP_872621.1:NP_872622.1  Mouse:Amelx| Q99217

在这里,它先按 “:” 拆分 “UniID” 列(得到一个列表列),然后用 unnest 将其展开为多行。


推荐阅读