r - 将 tidypmc 输出的列表对象按 PMCID 分别保存到单独的文件中
问题描述
我先执行一次检索,得到一批 PMCID;然后用这些 PMCID 通过 tidypmc 库(library)解析各篇论文中包含元数据的表格,最终结果以列表形式返回。有些 PMCID 对应的结果为空,因为文章中没有合适的表格标签等。现在我想把每个 PMCID 的结果保存到单独的文件中,我试过,但得到了一个错误,看来事情并不像我想的那么简单。由于每个 PMCID 的列表下可能有多个表格,这些表格也都需要保存在该 PMCID 之下。
我不确定该如何进行,但我想到的一种做法是:把每个 PMCID 的结果写入各自的文件夹;例如某个 PMCID 包含 4 个表,那么该 PMCID 对应的文件夹下就保存这 4 个表。
下面是我正在使用的代码
library("europepmc")
library(xml2)
library(tidypmc)
# Question script: search Europe PMC, download each article's XML by PMCID,
# extract its tables with tidypmc, then try to write one file per PMCID.
# The code is kept exactly as posted because it reproduces the error quoted
# after the snippet; the NOTE(review) comments point out the problems.

# Run the search, asking for parsed output and at most 20 hits.
b <-epmc_search(query = 'acute myeloid leukemia drug studies',output = 'parsed',limit = 20)
# Keep only the two id columns, then drop rows where either id is missing.
# NOTE(review): `%>%`, select() and filter() are presumably dplyr's; dplyr is
# not attached by the library() calls shown with this snippet -- confirm.
a <- b %>% select(pmid,pmcid)
a <- a[complete.cases(a),]
# Vector of PMCIDs to fetch. The name `c` is also base R's combine function;
# calls such as c(...) still resolve to the function.
c <- a$pmcid
# For every PMCID: fetch the XML, extract its tables and return them.
pub_tables <- lapply(c, function(pmc_id) {
message("-- Trying ", pmc_id, "...")
# A failed download is logged and turned into NULL instead of stopping lapply.
doc <- tryCatch(pmc_xml(pmc_id),
error = function(e) {
message("------ Failed to recover PMCID")
return(NULL)
})
if(!is.null(doc)) {
#-- If succeed, try to get table
# Judging by the dput() output shown later, this is either NULL or a named
# list of tibbles ("Table 1", "Table 2", ...).
tables <- pmc_table(doc)
if(!is.null(tables)) {
#-- If succeed, try to get table name
# NOTE(review): table_caps is computed but never used, because the renaming
# line below is commented out.
table_caps <- pmc_caption(doc) %>%
filter(tag == "table")
#names(tables) <- paste(table_caps$label, table_caps$text, sep = " - ")
}
return(tables)
} else {
#-- If fail, return NA
return(NA)
}
# NOTE(review): unreachable -- both branches above have already returned, so
# no pause ever happens between requests. In addition, sample(1:10) without a
# size argument yields all ten values in random order, not a single number.
Sys.sleep(sample(1:10))
})
# Label each result with the PMCID it came from.
names(pub_tables) <- c
# Attempt to write one file per PMCID.
# NOTE(review): pub_tables[i] (single bracket) is a one-element list whose
# element can itself hold several tables. write.csv() tries to coerce that to
# a single data frame, and tables with different row counts (28, 8 and 20 for
# PMC7372828 in the dput() output) cannot be combined -- this is the reported
# "differing number of rows" error.
# NOTE(review): 1:length(x) iterates over c(1, 0) when x is empty;
# seq_along(x) avoids that. The "output/" directory is assumed to exist.
for (i in 1:length(pub_tables)) {
write.csv(pub_tables[i], file=paste0("output/", names(pub_tables)[i], ".txt"))
}
运行后出现如下错误:
Error in (function (..., row.names = NULL, check.rows = FALSE, check.names = TRUE, : arguments imply differing number of rows: 28, 8, 20
下面附上 dput 的输出。
我把示例查询的结果数限制为 20,以使对象较小。
dput(pub_tables)
list(PMC6968541 = NULL, PMC7170320 = NULL, PMC7269076 = NULL,
PMC7219522 = NULL, PMC7372828 = list(`Table 1` = structure(list(
X1 = c("AML with recurrent genetic abnormalities", "AML with t(8;21)(q22;q22.1);RUNX1-RUNX1T1",
"AML with inv. (16)(p13.1q22) or t(16;16)(p13.1;q22);CBFB-MYH11",
"APL with PML-RARA", "AML with t(9;11)(p21.3;q23.3);MLLT3-KMT2A",
"AML with t(6;9)(p23;q34.1);DEK-NUP214", "AML with inv. (3)(q21.3q26.2) or t(3;3)(q21.3;q26.2); GATA2, MECOM",
"AML (megakaryoblastic) with t(1;22)(p13.3;q13.3);RBM15-MKL1",
"Provisional entity: AML with BCR-ABL1", "AML with mutated NPM1",
"AML with biallelic mutations of CEBPA", "Provisional entity: AML with mutated RUNX1",
"AML with myelodysplasia-related changes", "Therapy-related myeloid neoplasms",
"AML, NOS", "AML with minimal differentiation", "AML without maturation",
"AML with maturation", "Acute myelomonocytic leukemia",
"Acute monoblastic/monocytic leukemia", "Pure erythroid leukemia",
"Acute megakaryoblastic leukemia", "Acute basophilic leukemia",
"Acute panmyelosis with myelofibrosis", "Myeloid sarcoma",
"Myeloid proliferations related to Down syndrome", "Transient abnormal myelopoiesis (TAM)",
"Myeloid leukemia associated with Down syndrome"), X2 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_)), row.names = c(NA,
-28L), class = c("tbl_df", "tbl", "data.frame"), caption = "The 2016 WHO classification of acute myeloid leukemia (AML) and related neoplasms", footnotes = "APL, acute promyelocytic leukemia; NOS, not otherwise specified"),
`Table 2` = structure(list(`Functional category` = c("Myeloid transcription-factor genes",
"Nucleophosmin (NPM1) gene", "Tumor suppressor genes",
"Signaling genes", "DNA methylation", "Chromatin modifier",
"Cohesin complex", "Splicing factors"), `Gene members` = c("Transcription factor fusions by chromosomal rearrangements, such as t(8;21)(q22;q22); RUNX1-RUNX1T1 and inv(16)(p13.1q22) or t(16;16)(p13.1;q22); CBFB-MYH11GATA2, RUNX1 and CEBPA",
"NPM1", "TP53, WT1, PHF6", "FLT3, KIT, PTPN11, RAS",
"DNMT3A, TET2, IDH1, IDH2", "ASXL1, EZH2 and KMT2A",
"STAG1, STAG2, RAD21, SMC1A, SMC3,", "SRSF2, SF3B1, U2AF1, ZRSR2"
), `Role in AML Leukemogenesis` = c("Transcriptional deregulation and impaired hematopoietic differentiation.",
"Aberrant cytoplasmic localization of NPM1 and its interacting proteins",
"Transcriptional deregulation and impaired degradation via the negative regulator (MDM2 and PTEN oncogenes)",
"Proliferative advantage through the RAS-RAF, JAK-STAT, and PI3K-AKT signaling pathways",
"Deregulation of DNA methylation and oncometabolite production",
"Deregulation of chromatin modification and impairment of methyltransferases function",
"Impairment of accurate chromosome segregation and transcriptional regulation",
"Deregulated RNA processing and aberrant splicing patterns"
)), row.names = c(NA, -8L), class = c("tbl_df", "tbl",
"data.frame"), caption = "Functional categories of genes that are commonly mutated in acute myeloid leukemia (AML)"),
`Table 3` = structure(list(`Risk profiles` = c("Favorable",
"Favorable", "Favorable", "Favorable", "Favorable", "Intermediate",
"Intermediate", "Intermediate", "Intermediate", "Intermediate",
"Adverse", "Adverse", "Adverse", "Adverse", "Adverse",
"Adverse", "Adverse", "Adverse", "Adverse", "Adverse"
), Subgroups = c("t(8;21)(q22;q22.1); RUNX1-RUNX1T1",
"inv (16)(p13.1q22) or t(16;16)(p13.1;q22); CBFB-MYH11",
"Mutated NPM1 without FLT3-ITD", "Mutated NPM1 with FLT3-ITDlow",
"Biallelic mutated CEBPA", "Mutated NPM1 and FLT3-ITDhigh",
"Wild-type NPM1 without FLT3-ITD", "Wild-type NPM1 with FLT3-ITDlow",
"t(9;11)(p21.3;q23.3); MLLT3-KMT2A", "Cytogenetic abnormalities not classified",
"t(6;9)(p23;q34.1); DEK-NUP214", "t(v;11q23.3); KMT2A rearranged",
"t(9;22)(q34.1;q11.2); BCR-ABL1", "inv (3)(q21.3q26.2) or t(3;3)(q21.3;q26.2); GATA2,MECOM(EVI1)",
"Complex karyotype, monosomal karyotype", "-5 or del(5q); −7; −17/abn(17p)",
"Wild-type NPM1 and FLT3-ITDhigh", "Mutated RUNX1", "Mutated ASXL1",
"Mutated TP53")), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"), caption = "Risk stratification of AML according To 2017 ELN recommendations [24]", footnotes = "Low, low allelic ratio (< 0.5); high, high allelic ratio (≥0.5)")),
PMC7374966 = list(`Table 1` = structure(list(`Year of publication, region/country (reference)` = c("1970, West Virginia (USA)[7, 10]",
"1983, Thailand[5]", "1990, Texas (USA)[8]", "1992, Mississippi (USA)[12]",
"1994, Maryland (USA)[13]", "2009, India[11]", "2010, Germany[14]",
"2011, Japan[9]", "2018, Wisconsin (USA)[6]", "2019, Switzerland(present case)"
), `Underlying conditions` = c("1 year-old male, no underlying conditions",
"20 year-old female, no underlying conditions", "29 year-old male, cocaine abuse",
"64 year-old male, kidney transplantation", "32 year-old female, lymphocytic lymphoma with leukemic transformation (neutropenia)",
"10 year-old female, T-cell acute lymphoblastic leukemia",
"78 year-old female, myelodysplastic syndrome", "61 year-old male, mantle cell lymphoma, allogeneic HSCT",
"15 year-old male, B-cell lymphoblastic leukemia (neutropenia)",
"71 year-old, acute myeloid leukemia (neutropenia)"), `Organs affected` = c("Mediastinum, lungs, pericardium",
"Soft tissues (breast), lungs, mediastinum, liver, gastro-intestinal tract",
"Endocardium, blood, skin, heart, lungs, kidneys, brain, muscles",
"Lungs, myocardium, brain, kidney, thyroid", "Lungs, pericardium",
"Sinus, soft tissues (facial)", "Sinus, soft tissues (facial), brain",
"Lungs, heart, spleen, kidney, bladder, thyroid", "Sinus, lungs",
"Lungs"), Species = c("C. incongruus", "C. incongruus", "Conidiobolus spp.",
"C. coronatus", "C. incongruus", "C. coronatus", "C. incongruus",
"C. lamprauges", "C. coronatus", "Conidiobolus spp."), `Treatment (dose), duration and outcome` = c("Deoxycholate amphotericin B (1 mg/kg/day), 10 weeksOutcome: cure",
"Co-trimoxazole (2 g/day), duration NSOutcome: death",
"NoneOutcome: death", "Deoxycholate amphotericin B (50 mg every other day), until deathOutcome: death",
"Deoxycholate amphotericin B (0.5 mg/kg/day, then 1.5 mg/kg/day) and flucytosine (150 mg/kg/day), until deathSurgeryOutcome: death",
"Amphotericin B (NS), until deathSurgeryOutcome: death",
"Liposomal amphotericin B (200 mg/day), until deathSurgeryOutcome: death",
"Micafungin (150 mg/day) and liposomal amphotericin B (2.5 mg/kg/day), then intravenous voriconazole (6 mg/kg/day on day 1, then 4 mg/kg/day) and micafungin (150 mg/day), until deathOutcome: death",
"Liposomal amphotericin B (10 mg/kg/day) and anidulafungin (1.5 mg/kg/day) and oral terbinafine (250 mg twice per day), duration NSSurgery, granulocyte transfusionOutcome: cure",
"Caspofungin (70 mg/day on day 1, then 50 mg/day), then liposomal amphotericin B (5 mg/kg/day), then oral isavuconazole (200 mg three times per day on day 1 and 2, then 200 mg/day), 2 monthsSurgeryOutcome: cure"
)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"
), caption = "Case reports of invasive fungal infections due to Conidiobolus spp.", footnotes = "NS Not specified")))
任何建议或帮助将不胜感激。
解决方案
您需要按 Open Access 过滤搜索(或按 isOpenAccess 列过滤结果)
library(europepmc)
# Restrict the search itself to open-access records: only those expose the
# full-text XML that tidypmc parses.
b <- epmc_search(
  query = "acute myeloid leukemia drug studies OPEN_ACCESS:Y",
  limit = 20
)
# Keep the PMCIDs of open-access hits. `%in%` never returns NA, so a missing
# isOpenAccess value cannot inject an NA id the way `== "Y"` would; records
# without a PMCID are dropped for the same reason.
pmcids <- b$pmcid[b$isOpenAccess %in% "Y" & !is.na(b$pmcid)]
然后我会遍历 PMC id 并保存文本和表格
library(tidypmc)
# Download every article once and keep both its text and its tables.
# `txt` and `tbl` are preallocated lists named by PMCID, so an article without
# tables keeps a NULL placeholder instead of shifting the other elements.
n <- length(pmcids)
txt <- vector("list", n)
tbl <- vector("list", n)
names(txt) <- pmcids
names(tbl) <- pmcids
# seq_len(n) is empty when n == 0, whereas 1:n would iterate over c(1, 0).
for (i in seq_len(n)) {
  id <- pmcids[i]
  message("Parsing ", i, ". ", id)
  doc <- pmc_xml(id)
  txt[[i]] <- pmc_text(doc)
  ## pmc_table returns NULL if missing; assigning NULL with [[<- would delete
  ## the element, so only assign when tables were found.
  x <- pmc_table(doc)
  if (!is.null(x)) tbl[[i]] <- x
  # Pause 1-3 seconds between requests. size = 1 draws a single value;
  # sample(1:3) alone returns a permutation of all three numbers.
  Sys.sleep(sample(1:3, 1))
}
最后,将表格折叠成列名和单元格值对。
library(tidyverse)
# Stack the per-article text tibbles; the list names become a PMCID column.
txt2 <- txt %>%
  bind_rows(.id = "PMCID")
# Collapse each article's tables with collapse_rows(), then stack the results
# the same way.
tbl2 <- tbl %>%
  lapply(collapse_rows) %>%
  bind_rows(.id = "PMCID")
标题和脚注被保存为属性,所以你也可以取得它们(purrr 专家也许能把它格式化得更好)
# Inspect the attributes of the first table of the fifth article; the caption
# and footnotes entries of the printed result are shown below.
attributes(tbl[[5]][[1]])
# $caption
# [1] "The 2016 WHO classification of acute myeloid leukemia (AML) and related neoplasms"
# $footnotes
# [1] "APL, acute promyelocytic leukemia; NOS, not otherwise specified"
# Pull the "caption" attribute from every table of every article, flatten the
# nested result into one named vector and show it as a name/value tibble.
tbl %>%
  lapply(sapply, attr, "caption") %>%
  unlist() %>%
  enframe()
# name value
# <chr> <chr>
# 1 PMC7372828.Table 1 The 2016 WHO classification of acute myeloid leukemia (AML) and related neoplasms
# 2 PMC7372828.Table 2 Functional categories of genes that are commonly mutated in acute myeloid leukemia (AML)
# 3 PMC7372828.Table 3 Risk stratification of AML according To 2017 ELN recommendations [24]
# 4 PMC7374966.Table 1 Case reports of invasive fungal infections due to Conidiobolus spp.
# 5 PMC7362563.Table 1 Best overall response for patients with AML at any time on treatment
推荐阅读
- for-loop - for 循环问题。let n = 0; let x = 0; while (n < 3) { n++; x += n; 为什么 X 值会改变?
- java - 在 Java API 上使用 JMeter 进行低环境性能测试需要多长时间
- java - 在java中删除LinkedList中的节点
- javascript - 有人可以将此 HTML 与 CSS 和 JavaScript 结合起来吗
- html - 如何更改选择选项背景颜色
- mysql - 如何根据另一个表中列的值为一个表编写触发器?
- javascript - 从另一个来源获取的内联脚本执行代码
- mysql - 如果该行具有超过 1 个相同的值/重复,则从选择中排除行
- elasticsearch - Elasticsearch 中所有匹配用户的角色映射
- c# - 将对象列表从视图传递到控制器