首页 > 解决方案 > 如何根据输出列中的值提取列名并获取计数

问题描述

我有一个关于 R 中数据框操作的问题:如何根据各列的取值提取对应的列名,把这些列名以逗号分隔的形式写入输出列,并统计它们的数量。

我有一个输入文件,其中 A 列(Genes 列)是基因,其余各列是文献 ID(输入文件示例如下所示)。我想要的是:对每个基因,把所有取值为 1(value = 1)的文献 ID 收集到 Output 列中,并在 Counts 列中统计这些 ID 的数量(输出文件示例如下所示)。之后,我将使用 merge 函数把这个输出数据框与我感兴趣的基因列表合并。请帮我解决这个问题。

# Load the three CSV files. check.names = FALSE keeps the purely numeric
# literature IDs as column names instead of letting R rewrite them.
Input_data <- read.csv("./Input.csv", check.names = FALSE, stringsAsFactors = FALSE)
Output_data <- read.csv("./Output.csv", check.names = FALSE, stringsAsFactors = FALSE)
Genes <- read.csv("./Genes.csv", check.names = FALSE, stringsAsFactors = FALSE)

# Combine the per-gene output with the gene list on the shared "Genes" column.
Merge_data <- merge(x = Output_data, y = Genes, by = "Genes")


Input_data

dput(Input_data)
structure(list(Genes = c("Gene_A", "Gene_B", "Gene_C", "Gene_D", 
"Gene_E", "Gene_F", "Gene_G", "Gene_H", "Gene_I", "Gene_J", "Gene_K", 
"Gene_L", "Gene_M"), `20706538` = c(0L, 1L, 1L, 1L, 0L, 1L, 1L, 
1L, 0L, 0L, 0L, 0L, 0L), `14557386` = c(0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L), `22999554` = c(0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `21906313` = c(1L, 1L, 1L, 1L, 
0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L), `25229268` = c(1L, 1L, 1L, 
0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), `22633082` = c(0L, 1L, 
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), `19228761` = c(1L, 
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), `19543402` = c(0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `26955776` = c(1L, 
1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), `21126355` = c(1L, 
1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L)), class = "data.frame", row.names = c(NA, 
-13L))


Output_data

dput(Output_data)
structure(list(Genes = c("Gene_A", "Gene_B", "Gene_C", "Gene_D", 
"Gene_E", "Gene_F", "Gene_G", "Gene_H", "Gene_I", "Gene_J", "Gene_K", 
"Gene_L", "Gene_M"), Output = c("21906313, 25229268, 19228761, 26955776, 21126355", 
"20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355", 
"20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355", 
"20706538, 21906313, 22633082, 19228761, 26955776, 21126355", 
"", "20706538, 21906313, 25229268, 22633082, 26955776, 21126355", 
"20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355", 
"20706538, 21906313, 25229268, 22633082, 26955776, 21126355", 
"", "", "", "", "21906313, 21126355"), Counts = c(5L, 7L, 7L, 
6L, 0L, 6L, 7L, 6L, 0L, 0L, 0L, 0L, 2L)), class = "data.frame", row.names = c(NA, 
-13L))

Genes
dput(Genes)
structure(list(Genes = c("Gene_A", "Gene_B", "Gene_C", "Gene_D", 
"Gene_E", "Gene_F", "Gene_G", "Gene_H", "Gene_I", "Gene_J", "Gene_K", 
"Gene_L", "Gene_M", "Gene_N", "Gene_O", "Gene_P", "Gene_Q", "Gene_R", 
"Gene_S", "Gene_T", "Gene_U", "Gene_V", "Gene_W")), class = "data.frame", row.names = c(NA, 
-23L))

标签: r, dataframe, merge, dplyr, tidyr

解决方案


您的数据是宽格式的,这意味着一行(一个观测)包含多个值。如果把数据转换为长格式,即每行只有一个值,处理起来会更容易。可以参考“整洁数据”(tidy data)的概念。

我的解决方案与 @Ric S 的非常相似,只是我没有用 mutate,而是用了 summarise。summarise 正是为这种情况设计的:您希望分组变量的每个级别只对应一条记录。

# Example input: one row per gene, one 0/1 indicator column per literature ID.
# check.names = FALSE keeps the numeric IDs as column names unchanged.
Input_data <- data.frame(
  Genes = paste0("Gene_", LETTERS[1:13]),
  `20706538` = c(0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L),
  `14557386` = rep(0L, 13),
  `22999554` = rep(0L, 13),
  `21906313` = c(1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L),
  `25229268` = c(1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L),
  `22633082` = c(0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L),
  `19228761` = c(1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L),
  `19543402` = rep(0L, 13),
  `26955776` = c(1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L),
  `21126355` = c(1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L),
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# Genes of interest: Gene_A through Gene_W (23 rows, single character column).
Genes <- data.frame(
  Genes = paste0("Gene_", LETTERS[1:23]),
  stringsAsFactors = FALSE
)

library(dplyr)
library(tidyr)

# Build one row per gene of interest with:
#   Output - comma-separated literature IDs whose indicator equals 1
#   Counts - how many such IDs there are
# Genes with no hit (or absent from Input_data) get Output = "" and Counts = 0.
summary_data <- Input_data %>%
  # Wide -> long: one row per (gene, literature ID) pair.
  pivot_longer(-Genes, names_to = "literature_id", values_to = "is_contained") %>%
  # Keep only the pairs where the gene is mentioned in that publication.
  filter(is_contained == 1) %>%
  group_by(Genes) %>%
  summarise(Output = paste0(literature_id, collapse = ", "),
            Counts = n()) %>%
  # Join from the gene list so the result follows the row order of `Genes`
  # (newer dplyr versions of right_join() put unmatched rows at the end),
  # and state the key explicitly instead of relying on the guessed `by`.
  left_join(as_tibble(Genes), ., by = "Genes") %>%
  # Genes without any matching literature come back as NA from the join.
  mutate(Output = coalesce(Output, ""),
         Counts = coalesce(Counts, 0L))

summary_data
# A tibble: 23 x 3
   Genes  Output                                                                 Counts
   <chr>  <chr>                                                                   <int>
 1 Gene_A "21906313, 25229268, 19228761, 26955776, 21126355"                          5
 2 Gene_B "20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355"      7
 3 Gene_C "20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355"      7
 4 Gene_D "20706538, 21906313, 22633082, 19228761, 26955776, 21126355"                6
 5 Gene_E ""                                                                          0
 6 Gene_F "20706538, 21906313, 25229268, 22633082, 26955776, 21126355"                6
 7 Gene_G "20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355"      7
 8 Gene_H "20706538, 21906313, 25229268, 22633082, 26955776, 21126355"                6
 9 Gene_I ""                                                                          0
10 Gene_J ""                                                                          0
# ... with 13 more rows

推荐阅读