r - 基于函数在数据框列表中创建一列
问题描述
这是我的可重现示例:
# Reproducible example data: a named list of three data frames
# (hepg2, k562, hoel). Each has 3 rows and the same 15 columns:
# seqnames, start, end, width, strand, name, score, annotation, geneChr,
# geneStart, geneEnd, geneLength, geneStrand, geneId, distanceToTSS.
# `seqnames` and `strand` are factors; note that the `hoel` seqnames factor
# carries an extra "chrY" level that the other two do not have.
# NOTE(review): the structure() form looks like dput() output -- confirm.
grange_list <- list(hepg2 = structure(list(seqnames = structure(c(7L, 15L, 1L
), .Label = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6",
"chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14",
"chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21",
"chr22", "chrX"), class = "factor"), start = c(158126281L, 69110138L,
2205071L), end = c(158126380L, 69110237L, 2205170L), width = c(100L,
100L, 100L), strand = structure(c(2L, 2L, 1L), .Label = c("+",
"-", "*"), class = "factor"), name = c("FUS_HepG2_IDR", "FUS_HepG2_IDR",
"FUS_HepG2_IDR"), score = c(1000L, 1000L, 1000L), annotation = c("Intron (uc011kwa.2/5799, intron 2 of 22)",
"Intron (uc002arl.3/8125, intron 1 of 6)", "Intron (uc001aja.4/6497, intron 1 of 6)"
), geneChr = c(7L, 15L, 1L), geneStart = c(157331750L, 69070875L,
2160134L), geneEnd = c(158380482L, 69113261L, 2241652L), geneLength = c(1048733L,
42387L, 81519L), geneStrand = c(2L, 2L, 1L), geneId = c("5799",
"8125", "6497"), distanceToTSS = c(254102, 3024, 44937)), row.names = c(NA,
3L), class = "data.frame"), k562 = structure(list(seqnames = structure(c(10L,
22L, 11L), .Label = c("chr1", "chr2", "chr3", "chr4", "chr5",
"chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13",
"chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20",
"chr21", "chr22", "chrX"), class = "factor"), start = c(72508428L,
49992192L, 3072043L), end = c(72508527L, 49992291L, 3072142L),
width = c(100L, 100L, 100L), strand = structure(c(1L, 2L,
2L), .Label = c("+", "-", "*"), class = "factor"), name = c("FUS_K562_IDR",
"FUS_K562_IDR", "FUS_K562_IDR"), score = c(1000L, 1000L,
1000L), annotation = c("Intron (uc001jrg.3/140766, intron 15 of 21)",
"Intron (uc003biq.3/uc003biq.3, intron 1 of 4)", "Intron (uc001lxe.3/833, intron 1 of 22)"
), geneChr = c(10L, 22L, 11L), geneStart = c(72432559L, 50013290L,
3022152L), geneEnd = c(72522195L, 50051190L, 3078681L), geneLength = c(89637L,
37901L, 56530L), geneStrand = c(1L, 2L, 2L), geneId = c("140766",
"348645", "833"), distanceToTSS = c(75869, 58998, 6539)), row.names = c(NA,
3L), class = "data.frame"), hoel = structure(list(seqnames = structure(c(1L,
1L, 1L), .Label = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6",
"chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14",
"chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21",
"chr22", "chrX", "chrY"), class = "factor"), start = c(557045L,
870107L, 936673L), end = c(557144L, 870206L, 936772L), width = c(100L,
100L, 100L), strand = structure(c(1L, 1L, 1L), .Label = c("+",
"-", "*"), class = "factor"), name = c("FUS", "FUS", "FUS"),
score = c(1000L, 1000L, 1000L), annotation = c("Distal Intergenic",
"Intron (uc001abv.1/148398, intron 4 of 4)", "Distal Intergenic"
), geneChr = c(1L, 1L, 1L), geneStart = c(762971L, 860530L,
948847L), geneEnd = c(794826L, 879961L, 949919L), geneLength = c(31856L,
19432L, 1073L), geneStrand = c(1L, 1L, 1L), geneId = c("643837",
"148398", "9636"), distanceToTSS = c(-205827, 9577, -12075
)), row.names = c(NA, 3L), class = "data.frame"))
这是一个数据框列表,它看起来像这样:
$hepg2
seqnames start end width strand name score annotation geneChr geneStart geneEnd geneLength geneStrand geneId distanceToTSS
1 chr7 158126281 158126380 100 - FUS_HepG2_IDR 1000 Intron (uc011kwa.2/5799, intron 2 of 22) 7 157331750 158380482 1048733 2 5799 254102
2 chr15 69110138 69110237 100 - FUS_HepG2_IDR 1000 Intron (uc002arl.3/8125, intron 1 of 6) 15 69070875 69113261 42387 2 8125 3024
3 chr1 2205071 2205170 100 + FUS_HepG2_IDR 1000 Intron (uc001aja.4/6497, intron 1 of 6) 1 2160134 2241652 81519 1 6497 44937
$k562
seqnames start end width strand name score annotation geneChr geneStart geneEnd geneLength geneStrand geneId distanceToTSS
1 chr10 72508428 72508527 100 + FUS_K562_IDR 1000 Intron (uc001jrg.3/140766, intron 15 of 21) 10 72432559 72522195 89637 1 140766 75869
2 chr22 49992192 49992291 100 - FUS_K562_IDR 1000 Intron (uc003biq.3/uc003biq.3, intron 1 of 4) 22 50013290 50051190 37901 2 348645 58998
3 chr11 3072043 3072142 100 - FUS_K562_IDR 1000 Intron (uc001lxe.3/833, intron 1 of 22) 11 3022152 3078681 56530 2 833 6539
$hoel
seqnames start end width strand name score annotation geneChr geneStart geneEnd geneLength geneStrand geneId distanceToTSS
1 chr1 557045 557144 100 + FUS 1000 Distal Intergenic 1 762971 794826 31856 1 643837 -205827
2 chr1 870107 870206 100 + FUS 1000 Intron (uc001abv.1/148398, intron 4 of 4) 1 860530 879961 19432 1 148398 9577
3 chr1 936673 936772 100 + FUS 1000 Distal Intergenic 1 948847 949919 1073 1 9636 -12075
我创建了一个函数,用于在 annotation 列中查找特定模式:
# Flag whether `annotation` mentions a genic feature.
#
# Returns a single numeric value: 1 if ANY element of `annotation` matches
# "UTR", "Intron" or "Exon" (case-insensitively), otherwise 0. The whole
# input vector is collapsed to one flag; this is not a per-element result.
flag_annot <- function(annotation) {
  feature_patterns <- c("UTR", "Intron", "Exon")
  # For each pattern, record whether grep() found at least one match.
  matched <- vapply(
    feature_patterns,
    function(pattern) {
      length(grep(pattern, annotation, ignore.case = TRUE)) > 0
    },
    logical(1)
  )
  if (any(matched)) 1 else 0
}
目标是创建另一个名为 intragenic 的列,其取值(1 或 0)取决于 annotation 列中的内容。
我知道我可以像这样子集注释列:
# Pull the `annotation` column out of every data frame in the list;
# single-column `[` indexing returns each column as a plain vector.
lapply(grange_list, function(df) df[, "annotation"])
我一直在寻找一种简洁的单行写法(one-liner),也许可以使用 mapply,把 flag_annot 函数与我刚才在上面所做的那种子集操作结合起来。谢谢。
解决方案
为了简化,我将修改该函数:改用 grepl,并用 | 把多个模式合并成一个正则表达式,而不是分别编写它们。
# Per-element genic-feature flag.
#
# Returns an integer vector the same length as `annotation`: 1L where the
# element matches "UTR", "Intron" or "Exon" (case-insensitively), else 0L.
flag_annot <- function(annotation) {
  is_genic <- grepl(
    pattern = "UTR|Intron|Exon",
    x = annotation,
    ignore.case = TRUE
  )
  as.integer(is_genic)
}
然后像下面这样使用 lapply:
# Add a 0/1 `intragenic` column to every data frame in the list.
# transform() evaluates `annotation` as a column of the data frame it is given.
lapply(
  X = grange_list,
  FUN = function(peaks) {
    transform(peaks, intragenic = flag_annot(annotation))
  }
)
推荐阅读
- elasticsearch - Elasticsearch 插件上的 kibana 警告
- amazon-web-services - 如何避免删除现有的托管策略?
- d3.js - 在 d3.js 中引用字典值时出错
- swift - 如何:居中 UICollectionViewCells
- laravel - 如何在 Laravel 中实现 GoogleOR-Tool?特别是对于“作为最小成本流算法的分配”(Java)
- ajax - 如何使用 jquery Ajax 将多个输入发送到控制器 spring mvc
- python - 有没有一种方法可以使用键的值来比较两个字典?
- node.js - 如何填充 ObjectIDS 数组的每个条目?
- reactjs - 如果不转到主页,则将应用程序重定向到 404
- angular - 在 *ngIf 中更改变量时组件未加载