首页 > 解决方案 > R使用向量过滤数据框

问题描述

我有一个有 9 列的数据框,我想根据另一个列表过滤行。列表项位于数据框的第 9 列,但同一列中还有其他信息。

数据框:

ST4.03ch01  Cufflinks   mRNA    152322  153489  .   -   .   ID=PGSC0003DMT400039136;Parent=PGSC0003DMG400015133;Source_id=RNASEQ26.809.0;Mapping_depth=16.192011;Class=4;name=Defensin
ST4.03ch01  GLEAN   mRNA    160499  160663  .   -   .   ID=PGSC0003DMT400039133;Parent=PGSC0003DMG400015132;Source_id=PGSC0003DMG000019750;Class=2;name=Defensin J1-2
ST4.03ch01  Cufflinks   mRNA    160379  161885  .   -   .   ID=PGSC0003DMT400039134;Parent=PGSC0003DMG400015132;Source_id=RNASEQ26.803.0;Mapping_depth=35.840147;Class=2;name=Defensin J1-2
ST4.03ch01  Cufflinks   mRNA    225084  229469  .   +   .   ID=PGSC0003DMT400058594;Parent=PGSC0003DMG400022764;Source_id=RNASEQ39.8869.0;Mapping_depth=158.359878;Class=1;name=Glutamate decarboxylase isoform3
ST4.03ch01  Cufflinks   mRNA    248940  249501  .   -   .   ID=PGSC0003DMT400058700;Parent=PGSC0003DMG400022799;Source_id=RNASEQ39.8875.0;Mapping_depth=16.138790;Class=5;name=Gene of unknown function
ST4.03ch09  GLEAN   mRNA    50581315    50583516    .   -   .   ID=PGSC0003DMT400009752;Parent=PGSC0003DMG400003817;Source_id=PGSC0003DMG000004746;Class=2;name=RNA-directed DNA polymerase (Reverse transcriptase); Ribonuclease H
ST4.03ch05  Cufflinks   mRNA    1469122 1469329 .   -   .   ID=PGSC0003DMT400064680;Parent=PGSC0003DMG400025123;Source_id=RNASEQ55.4892.0;Mapping_depth=2.218355;Class=2;name=Class S F-box protein

向量:

 [1] "PGSC0003DMT400035774"                     "PGSC0003DMT400086293"                 "PGSC0003DMT400086672" "PGSC0003DMT400095645" "PGSC0003DMT400091693" "PGSC0003DMT400080908"
 [7] "PGSC0003DMT400087868" "PGSC0003DMT400086807" "PGSC0003DMT400085319" "PGSC0003DMT400088916" "PGSC0003DMT400076675" "PGSC0003DMT400092517"
[13] "PGSC0003DMT400009752" "PGSC0003DMT400084819" "PGSC0003DMT400092699" "PGSC0003DMT400085283" "PGSC0003DMT400086058" "PGSC0003DMT400064680"

向量中的每一项都与第 9 列中 ID=........ 的部分相匹配。我理想的结果是得到与每个匹配项对应的 name=.... 内容。

理想输出:

PGSC0003DMT400009752    RNA-directed DNA polymerase (Reverse transcriptase); Ribonuclease H
PGSC0003DMT400064680    Class S F-box protein

有任何想法吗?

谢谢你

标签: r, list, dataframe, filter

解决方案


这是一个使用 str_detect 和 str_extract 的方案

library(dplyr)
library(stringr)
# Keep the rows of df1 whose ID attribute (in column v9) is one of the
# identifiers in vec1, then return that ID next to the text of the name
# attribute.
#
# The alternation is anchored to the `ID=` attribute (start of the string or
# right after a `;`, and followed by `;` or the end of the string), so a row is
# not kept merely because one of the identifiers shows up in another attribute
# such as Parent=, Source_id= or inside the free-text name.
# unique() keeps the generated pattern free of repeated alternatives.
id_pattern <- str_c(
  "(?:^|;)ID=(?:",
  str_c(unique(vec1), collapse = "|"),
  ")(?:;|$)"
)

df1 %>%
  filter(str_detect(v9, id_pattern)) %>%
  transmute(
    # Identifier that follows "ID=".
    ID = str_extract(v9, "(?<=ID\\=)\\w+"),
    # Everything after "name=" up to the end of the string; the name itself
    # may contain ";" so it must not be cut at the next separator.
    details = str_extract(v9, "(?<=name\\=).*")
  )

-输出

#          ID                                                             details
#1 PGSC0003DMT400009752 RNA-directed DNA polymerase (Reverse transcriptase); Ribonuclease H
#2 PGSC0003DMT400064680                                               Class S F-box protein

数据

# Example input: seven GFF-style records, one per row, in nine columns v1..v9.
# v4 and v5 are integer coordinates; v9 holds the ";"-separated attribute
# string (ID=...;Parent=...;...;name=...). All other columns are character.
df1 <- data.frame(
  v1 = c(rep("ST4.03ch01", 5L), "ST4.03ch09", "ST4.03ch05"),
  v2 = c(
    "Cufflinks", "GLEAN", "Cufflinks", "Cufflinks", "Cufflinks",
    "GLEAN", "Cufflinks"
  ),
  v3 = rep("mRNA", 7L),
  v4 = c(
    152322L, 160499L, 160379L, 225084L, 248940L,
    50581315L, 1469122L
  ),
  v5 = c(
    153489L, 160663L, 161885L, 229469L, 249501L,
    50583516L, 1469329L
  ),
  v6 = rep(".", 7L),
  v7 = c("-", "-", "-", "+", "-", "-", "-"),
  v8 = rep(".", 7L),
  v9 = c(
    "ID=PGSC0003DMT400039136;Parent=PGSC0003DMG400015133;Source_id=RNASEQ26.809.0;Mapping_depth=16.192011;Class=4;name=Defensin",
    "ID=PGSC0003DMT400039133;Parent=PGSC0003DMG400015132;Source_id=PGSC0003DMG000019750;Class=2;name=Defensin J1-2",
    "ID=PGSC0003DMT400039134;Parent=PGSC0003DMG400015132;Source_id=RNASEQ26.803.0;Mapping_depth=35.840147;Class=2;name=Defensin J1-2",
    "ID=PGSC0003DMT400058594;Parent=PGSC0003DMG400022764;Source_id=RNASEQ39.8869.0;Mapping_depth=158.359878;Class=1;name=Glutamate decarboxylase isoform3",
    "ID=PGSC0003DMT400058700;Parent=PGSC0003DMG400022799;Source_id=RNASEQ39.8875.0;Mapping_depth=16.138790;Class=5;name=Gene of unknown function",
    "ID=PGSC0003DMT400009752;Parent=PGSC0003DMG400003817;Source_id=PGSC0003DMG000004746;Class=2;name=RNA-directed DNA polymerase (Reverse transcriptase); Ribonuclease H",
    "ID=PGSC0003DMT400064680;Parent=PGSC0003DMG400025123;Source_id=RNASEQ55.4892.0;Mapping_depth=2.218355;Class=2;name=Class S F-box protein"
  ),
  # Keep text columns as character vectors on every R version.
  stringsAsFactors = FALSE
)

# Identifiers to look up in the ID attribute of df1$v9.
# The first identifier appears twice in this example vector.
vec1 <- c(
  "PGSC0003DMT400009752",
  "PGSC0003DMT400009752",
  "PGSC0003DMT400084819",
  "PGSC0003DMT400092699",
  "PGSC0003DMT400085283",
  "PGSC0003DMT400086058",
  "PGSC0003DMT400064680"
)

推荐阅读