首页 > 解决方案 > 如何在特定位置拆分 .txt 文件并将每个部分作为表格导入 r

问题描述

我有一些包含化学品记录的巨大文本文件。每条记录都以“*NEWRECORD”开头,并以空行结尾。我不知道一共有多少条记录,而且每条记录的行数各不相同。如何将每条记录保存为单独的数据框?

下面是我的文本文件的示例

# Sample input: one string holding three records. Each record starts with a
# "*NEWRECORD" line and records are separated by a blank line. The lines of a
# record are "KEY = value" pairs; the set of keys differs between records and
# a key (here QX) may occur several times within one record.
imported_data <- c("*NEWRECORD
   SH = diagnostic imaging
   QE = DIAG IMAGE
   QA = DG
   QT = 1
   QX = X-ray|NRW
   UI = Q000000981

   *NEWRECORD
   RECTYPE = Q
   SH = analogs & derivatives
   QE = ANALOGS
   QA = AA
   QT = 1

   *NEWRECORD
   RECTYPE = Q
   SH = abnormalities
   QE = ABNORM
   QX = agenesis|NRW
   QX = anomalies|EQV
   QX = aplasia|NRW
   QX = atresia|NRW
   QX = birth defects|NRW
   QX = congenital defects|NRW
   QX = defects|NRW
   QX = deformities|NRW
   QX = hypoplasia|NRW
   UI = Q000002")

# What I expect is

# Table_1
# SH                    QE         QA   QT   QX         UI
# diagnostic imaging   DIAG IMAGE  DG   1    X-ray|NRW  Q000000981

# Table_2
# RECTYPE   SH                     QE        QA     QT
# Q         analogs & derivatives  ANALOGS   AA     1

# and so on ...

标签: r split

解决方案


这或许可以作为解决方案的起点:

library(tidyverse)

# Split the raw text at every "*NEWRECORD" marker.
l <- str_split(imported_data, "\\*NEWRECORD")

# Build one long key/value data frame (columns k and v) per record. The piece
# in front of the first marker is dropped with [-1].
purrr::map(l[[1]][-1], function(chunk) {
  # Break the record into lines; the first piece is the remainder of the
  # marker line, so it is skipped.
  record_lines <- str_split(chunk, "\n")[[1]][-1]
  key_value <- filter(data.frame(z = record_lines), str_detect(z, "="))
  # Split only at the first " = " so the value keeps any later " = ".
  key_value <- separate(key_value, z, c("k", "v"), " = ", extra = "merge")
  # Strip the indentation (all spaces) from the key.
  mutate(key_value, k = str_replace_all(k, " ", ""))
})

#[[1]]
#   k                  v
#1 SH diagnostic imaging
#2 QE         DIAG IMAGE
#3 QA                 DG
#4 QT                  1
#5 QX          X-ray|NRW
#6 UI         Q000000981

#[[2]]
#        k                     v
#1 RECTYPE                     Q
#2      SH analogs & derivatives
#3      QE               ANALOGS
#4      QA                    AA
#5      QT                     1

#[[3]]
#         k                      v
#1  RECTYPE                      Q
#2       SH          abnormalities
#3       QE                 ABNORM
#4       QX           agenesis|NRW
#5       QX          anomalies|EQV
#6       QX            aplasia|NRW
#7       QX            atresia|NRW
#8       QX      birth defects|NRW
#9       QX congenital defects|NRW
#10      QX            defects|NRW
#11      QX        deformities|NRW
#12      QX         hypoplasia|NRW
#13      UI                Q000002

如果想把所有记录合并成一个数据框,可以这样做:

# Split the raw text at every "*NEWRECORD" marker.
l <- str_split(imported_data, "\\*NEWRECORD")

# Build a single tibble with one row per record and one column per key.
# The piece in front of the first marker is dropped with [-1].
l[[1]][-1] %>%
  purrr::map(function(x) {
    # One row per "KEY = value" line; the first piece of the split is the
    # remainder of the marker line and is skipped.
    data.frame(z = str_split(x, "\n")[[1]][-1]) %>%
      filter(str_detect(z, "=")) %>%
      # Split only at the first " = " so the value keeps any later " = ".
      separate(z, c("k", "v"), " = ", extra = "merge") %>%
      # Strip the indentation (all spaces) from the key.
      mutate(k = str_replace_all(k, " ", "")) %>%
      # Keys that repeat within a record (e.g. several QX lines) are collapsed
      # into one comma-separated string so the record fits in a single row.
      group_by(k) %>%
      summarise(v = paste(v, collapse = ", ")) %>%
      # pivot_wider() is the current replacement for the superseded spread().
      pivot_wider(names_from = k, values_from = v)
  }) %>%
  # bind_rows() takes the list directly, fills keys missing from a record
  # with NA, and returns an empty tibble for an empty list, where
  # reduce(bind_rows) would raise an error.
  bind_rows()
## A tibble: 3 x 7
#  QA    QE         QT    QX                                                                                                                                        SH                   UI        RECTYPE
#  <chr> <chr>      <chr> <chr>                                                                                                                                 <chr>                <chr>     <chr>  
#1 DG    DIAG IMAGE 1     X-ray|NRW                                                                                                                             diagnostic imaging   Q0000009~ <NA>   
#2 AA    ANALOGS    1     <NA>                                                                                                                                  analogs & derivativ~ <NA>      Q      
#3 <NA>  ABNORM     <NA>  agenesis|NRW, anomalies|EQV, aplasia|NRW, atresia|NRW, birth defects|NRW, congenital defects|NRW, defects|NRW, deformities|NRW, hypo~ abnormalities        Q000002   Q

推荐阅读