r - 如何根据单元格的值提取列名,并在输出列中汇总这些列名及其计数
问题描述
我有一个关于 R 中数据框操作的问题:如何根据单元格的值提取列名,把这些列名以逗号分隔写入输出列,并统计它们的数量。
我有一个输入文件,其中第一列(Genes)是基因,其余各列以文献 ID 命名(输入文件示例如下所示)。我想要的是:对每个基因,把所有 value = 1 的文献 ID 收集到 Output 列中,并在 Counts 列中统计这些 ID 的数量(输出文件示例如下所示)。之后,我会使用 merge 函数将这个输出数据框与我感兴趣的基因列表合并。请帮我解决这个问题。
# Load the three CSV tables. check.names = FALSE leaves the column headers
# exactly as written in the files, and strings are kept as character vectors.
Genes <- read.csv(
  "./Genes.csv",
  check.names = FALSE,
  stringsAsFactors = FALSE
)
Input_data <- read.csv(
  "./Input.csv",
  check.names = FALSE,
  stringsAsFactors = FALSE
)
Output_data <- read.csv(
  "./Output.csv",
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# Combine the per-gene output with the gene list on the shared "Genes" column.
Merge_data <- merge(x = Output_data, y = Genes, by = "Genes")
# Example input table: 13 genes (rows); every other column is named after a
# literature ID and holds an integer 0/1 flag for that gene.
Input_data
# The structure(...) expression below is the dput() output, i.e. R code that
# recreates Input_data.
dput(Input_data)
structure(list(Genes = c("Gene_A", "Gene_B", "Gene_C", "Gene_D",
"Gene_E", "Gene_F", "Gene_G", "Gene_H", "Gene_I", "Gene_J", "Gene_K",
"Gene_L", "Gene_M"), `20706538` = c(0L, 1L, 1L, 1L, 0L, 1L, 1L,
1L, 0L, 0L, 0L, 0L, 0L), `14557386` = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L), `22999554` = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `21906313` = c(1L, 1L, 1L, 1L,
0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L), `25229268` = c(1L, 1L, 1L,
0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), `22633082` = c(0L, 1L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), `19228761` = c(1L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), `19543402` = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `26955776` = c(1L,
1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), `21126355` = c(1L,
1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L)), class = "data.frame", row.names = c(NA,
-13L))
# Desired result: one row per gene. Output is a ", "-separated string of the
# literature IDs whose flag is 1 for that gene (empty string when there are
# none); Counts is the number of IDs listed in Output.
Output_data
# The structure(...) expression below is the dput() output, i.e. R code that
# recreates Output_data.
dput(Output_data)
structure(list(Genes = c("Gene_A", "Gene_B", "Gene_C", "Gene_D",
"Gene_E", "Gene_F", "Gene_G", "Gene_H", "Gene_I", "Gene_J", "Gene_K",
"Gene_L", "Gene_M"), Output = c("21906313, 25229268, 19228761, 26955776, 21126355",
"20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355",
"20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355",
"20706538, 21906313, 22633082, 19228761, 26955776, 21126355",
"", "20706538, 21906313, 25229268, 22633082, 26955776, 21126355",
"20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355",
"20706538, 21906313, 25229268, 22633082, 26955776, 21126355",
"", "", "", "", "21906313, 21126355"), Counts = c(5L, 7L, 7L,
6L, 0L, 6L, 7L, 6L, 0L, 0L, 0L, 0L, 2L)), class = "data.frame", row.names = c(NA,
-13L))
# Gene list of interest to merge with: a single Genes column holding 23 genes,
# Gene_A through Gene_W.
Genes
# The structure(...) expression below is the dput() output, i.e. R code that
# recreates Genes.
dput(Genes)
structure(list(Genes = c("Gene_A", "Gene_B", "Gene_C", "Gene_D",
"Gene_E", "Gene_F", "Gene_G", "Gene_H", "Gene_I", "Gene_J", "Gene_K",
"Gene_L", "Gene_M", "Gene_N", "Gene_O", "Gene_P", "Gene_Q", "Gene_R",
"Gene_S", "Gene_T", "Gene_U", "Gene_V", "Gene_W")), class = "data.frame", row.names = c(NA,
-23L))
解决方案
您的数据是宽格式的,也就是说一行(一个观测)里包含多个值。如果把数据转换成长格式,即每行只有一个值,处理起来会容易得多。可以参考“整洁数据”(tidy data)的概念。
我的解决方案与 @Ric S 的非常相似,只是我没有使用 mutate,而是使用了 summarise。
summarise 正是为这种情况设计的:您希望分组变量的每个水平只得到一行结果:
# Reproducible copy of the example input: 13 genes (Gene_A .. Gene_M) and one
# integer 0/1 column per literature ID. check.names = FALSE keeps the numeric
# IDs as column names instead of rewriting them into syntactic names.
Input_data <- data.frame(
  Genes = paste0("Gene_", LETTERS[1:13]),
  `20706538` = c(0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L),
  `14557386` = rep(0L, 13L),
  `22999554` = rep(0L, 13L),
  `21906313` = c(1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L),
  `25229268` = c(1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L),
  `22633082` = c(0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L),
  `19228761` = c(1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L),
  `19543402` = rep(0L, 13L),
  `26955776` = c(1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L),
  `21126355` = c(1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L),
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# Gene list of interest: 23 genes, Gene_A .. Gene_W.
Genes <- data.frame(
  Genes = paste0("Gene_", LETTERS[1:23]),
  stringsAsFactors = FALSE
)
library(dplyr)
library(tidyr)
# Build one summary row per gene of interest:
#   Output - literature IDs flagged 1 for that gene, joined with ", "
#   Counts - how many IDs were flagged
# Step 1: reshape wide -> long, keep only the flagged (gene, ID) pairs and
# collapse them to one row per gene. Genes without any flagged ID drop out
# here and are restored in step 2.
flagged_ids <- Input_data %>%
  pivot_longer(
    -Genes,
    names_to = "literature_id",
    values_to = "is_contained"
  ) %>%
  filter(is_contained == 1) %>%
  group_by(Genes) %>%
  summarise(
    Output = paste0(literature_id, collapse = ", "),
    Counts = n()
  )

# Step 2: start from the gene list so every gene appears exactly once and in
# the list's own order, independent of how the installed dplyr version orders
# the unmatched rows of a right_join(). The explicit `by` avoids the
# "Joining, by = ..." message and a silent change of key should the two tables
# ever share another column. as_tibble() keeps the result a tibble.
summary_data <- as_tibble(Genes) %>%
  left_join(flagged_ids, by = "Genes") %>%
  # Genes absent from flagged_ids come out of the join as NA: show them as an
  # empty ID list with a count of zero.
  replace_na(list(Output = "", Counts = 0L))

summary_data
# A tibble: 23 x 3
Genes Output Counts
<chr> <chr> <int>
1 Gene_A "21906313, 25229268, 19228761, 26955776, 21126355" 5
2 Gene_B "20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355" 7
3 Gene_C "20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355" 7
4 Gene_D "20706538, 21906313, 22633082, 19228761, 26955776, 21126355" 6
5 Gene_E "" 0
6 Gene_F "20706538, 21906313, 25229268, 22633082, 26955776, 21126355" 6
7 Gene_G "20706538, 21906313, 25229268, 22633082, 19228761, 26955776, 21126355" 7
8 Gene_H "20706538, 21906313, 25229268, 22633082, 26955776, 21126355" 6
9 Gene_I "" 0
10 Gene_J "" 0
# ... with 13 more rows
推荐阅读
- javascript - How to make a semi circle chart in react?
- python - random API latency spikes for cryptocurrency exchanges
- java - 当我在主活动布局中添加片段并添加它的代码时,应用程序崩溃了
- sql - join with lookup and group by
- python - Create a custom function as a filter in proxy model, django admin
- excel - PasteSpecial 给我一个运行时错误 1004
- python - "non unique" sequence by Id
- vba - 我正在尝试使用键盘小键盘中的数字在 ms 访问中创建一个子宏作为自动键宏
- python - 我需要等待 ThreadPoolExecutor 完成吗?
- c# - 如何正确地将 CSV 数据放入 C# 中的自定义类记录中?