首页 > 解决方案 > 如何基于另一个 .csv 文件删除 .csv 文件的元素?

问题描述

我正在用 R 语言编写一个在文本中查找单词的脚本。目前我只能得到一个 .csv 单词列表(abstract_atomized.csv),其中的单词已被拆分(原子化)并按出现频率排序。我还有另一个 .csv 列表(wordlistenglish.csv),其中包含一组常见且无用的英语单词,我想把这些单词从第一个列表中删除,以便只保留相关的元素。我不知道如何在 R 中有效地做到这一点。你能帮我吗?

谢谢。

    library(pubmed.mineR)
    library(scholar)
    library(tools)
    library(stringr)
    # Read the abstracts from the file whose path is stored in `abstract`
    # (defined outside this snippet) and break them down into a word table.
    abstractR <- readabs(abstract)
    atomized_text <- word_atomizations(abstractR)

    # Derive the output name from the input path:
    # "<input path without extension>_atomized.csv".
    file_without_ext <- file_path_sans_ext(abstract)
    atomized_file_name <- paste0(file_without_ext, '_atomized.csv')
    write.csv(atomized_text, file = atomized_file_name)

    # Tell the user where the word table was written.
    output_text <- paste0(
      'Most used words described in ',
      atomized_file_name,
      ' take the time to read them and to select the relevent key words'
    )
    print(output_text)
# Remove the common English words listed in 'wordlistenglish.csv' from the
# atomized word table and save what is left to 'abstract_atom.csv'.
abstract_atom <- read.csv('abstract_atomized.csv')

# The word-list file has no header row: read with the default header = TRUE,
# its first word ("the") becomes the column name and `wordlist$words` is NULL,
# so nothing would ever match. Read it header-less and name the column.
wordlist <- read.csv('wordlistenglish.csv', header = FALSE, col.names = 'words')

# Subsetting returns a new data frame, so the result must be assigned back;
# otherwise the unfiltered table is what gets written below.
# drop = FALSE keeps the result a data frame whatever its column count.
is_common <- abstract_atom$words %in% wordlist$words
abstract_atom <- abstract_atom[!is_common, , drop = FALSE]

write.csv(abstract_atom, file = 'abstract_atom.csv')

更新

数据结构(我不能放整个结构,因为字符太多:第一个列表大约 6000 个单词,第二个列表大约 1000 个单词)

原子表结构:

    structure(list(words = structure(c(2772L, 4003L, 737L, 2371L, 3797L, 4988L
    ), .Label = c("-29", "-325", "-328", "-337", "-59", "-dependent", 
    "-dichlorophenyl)-1", "-disulfonic", "-induced", "-maleimidyldistilbene-2", 
    "-sh", "-so(3)(-))", "\"giacomo", "\"paradox", "(-323", "(-335", 
    "(-s-)", "(-so(-))", "(-so(2)(-)", "(#)contributed", "(1)department", 
    "(1)institut", "(1)instituto", "(1)laboratoire", "(1)laboratory", 
    "(1)plant", "(1)unité", "(10)laboratory", "(2-cys", "(2)bio-pharmaceutical", 
    "(2)department", "(2)sorbonne", "(2)université", "(219)cgpc(222)", 
    "(28)wcsys(32)", "(3)institute", "(3)laboratoire", "(3)laboratory", 
    "(3)plant", "(3d)", "(4)laboratoire", "(4)laboratory", "(4)sorbonne", 
    "(5)bio-pharmaceutical", "(5)department", "(6)laboratoire", 
    "(6)laboratory", "(6)plant", "(7)laboratoire", "(7)laboratory", 
    "(7)spemann", "(8)laboratory", "(8)université", "(9)laboratoire", 
    "(a(4)", "(a(4)-gapdh)", "(and", "(arabidopsis", "(aromatic", 
    "(atprk)", "(b-containing", "(bio-phase)", "(biogssg)", "(bioss)", 
    "(c86)", "(cb)", "(cr)", "(crpgk1)", "(crprk)", "(crtk)", 
    "(crtkapo)", "(cx(2)c)", "(cys(149)-ssg)", "(cys(29))", "(cys(87))", 
    "(dcmu)", "(deduced", "(diamide", "(e", "(e(m))", "(eda)", 
    "(eeg)", "(er)", "(fd)", "(for", "(frias)", "(ftr)", "(gapc1", 
    "(gapdh)", "(glyceraldehyde-3-phosphate", "(glycine", "(gpxs)", 
    "(grx)", "(grxs)", "(gsh)", "(h2o2)", "(heat-shock", "(hsp70", 
    "(i", "(icl)", "(inra)", "(isoform", "(lhcii)", "(metso)", 
    "(mms)", "(msrs)", "(multifunctional)", "(nadp-mdh)", "(negative", 
    "(no)", "(o2*)", "(pdi)", "(pgk1)", "(phaseolus", "(pk(a)=5", 
    "(pm)", "(populus", "(prk)", "(prxii)", "(prxs)", "(ptm)", 
    "(real", "(reduced", "(rns)", "(ros)", "(sll1621)", "(sll1908)", 
    "(slr1562", "(slr1849)", "(sno)", "(sorghum", "(spinacia", 
    "(ss)", "(ssg)", "(tk)", "(tpi)", "(tpp)", "(trx", "(trx)", 
    "(trx)-dependent", "(trxf)", "(trxh1)", "(trxh2)", "(trxs)", 
    "(upmc)", "(β/α)8-barrel", "[(35)s]cysteine", "[4fe-4s]", 
    "[fe2s2]", "[gsh]/[gsno]", "[gsh]/[gssg]", "&", "+/-", "+300", 
    "+80", "<2-fold", "~10-fold", "~20-fold", "~6-fold", "06", 
    "1-cys", "1-dimethylurea", "10 μm", "1052", "11", "110", 
    "1136", "1188", "119", "12", "125", "1278", "13", "133", 
    "1417", "16", "18", "190", "2-cys", "2-cys-peroxiredoxin", 
    "2)", "200", "225", "24", "25", "26", "29", "2nd", "3-(3", 
    "3-bisphosphoglycerate", "3-phosphoglycerate", "30", "33", 
    "381", "383", "392", "3d", "3d-structure", "3rd", "4-acetamido-4", 
    "40126", "41092", "42", "492", "54506", "55", "56", "561", 
    "7)", "70 kda)", "70803", "75005", "79104", "81", "8226", 
    "86", "8600", "8618", "9)", "90", "90095", "91405", "94720-3102", 
    "98%", "å", "a(2)b(2)", "a(4)-gapdh", "a(4)-glyceraldehyde-3-phosphate", 
    "a(8)b(8)", "a(n)b(n)-gapdh", "aa", "ab", "åb", "abdelmohsen", 
    "abe", "abedin", "abeliovich", "ability", "abiotic", "able", 
    "abnormally", "absence", [...] "zhong", "zhou", "zhu", "zhuang", 
    "zimmer", "zio", "ziparo", "zj", "zm", "zn", "zois", "zoladek", 
    "zong", "zorzano", "zughaier", "zw", "zx", "β-sheet"), class = "factor"), 
    Freq = c(161L, 150L, 114L, 98L, 90L, 79L)), row.names = c(NA, 
6L), class = "data.frame")

词表结构

structure(list(the = structure(c(8762L, 9971L, 9929L, 9917L, 
9628L, 9437L), .Label = c("a", "aa", "aaa", "aaron", "ab", "abandoned", 
"abc", "aberdeen", "abilities", "ability", "able", "aboriginal", 
"abortion", "about", "above" [...] "yukon", "z", "za", "zambia", "zdnet", "zealand", "zen", "zero", 
"zimbabwe", "zinc", "zip", "zoloft", "zone", "zones", "zoning", 
"zoo", "zoom", "zoophilia", "zope", "zshops", "zu", "zum", "zus"
), class = "factor")), row.names = c(NA, 6L), class = "data.frame")

标签: r csv

解决方案


这是您应该使用的通用格式,请根据您的实际数据结构进行调整:

# Keep only the rows of list1 whose `words` value does not occur in
# list2$words. drop = FALSE keeps the result a data frame even when list1
# has a single column (the default would collapse it to a plain vector).
filtered_list <- list1[!(list1$words %in% list2$words), , drop = FALSE]

它保留 list1 中 words 未出现在 list2 中的所有行。如果您想使用 tidyverse(速度较慢,但通常更方便编写代码)来执行此操作,它将如下所示:

# Load dplyr with library(): it stops with an error if the package is not
# installed, whereas require() only returns FALSE and lets the script fail
# later at the first dplyr call.
library(dplyr)

# Keep only the rows of list1 whose `words` value does not occur in
# list2$words.
filtered_list <- list1 %>%
  filter(!(words %in% list2$words))

推荐阅读