r - R导入半非结构化数据CSV
问题描述
我习惯于将带有列标题和单个表的直接 csv 加载到 R 中,我有一个具有以下结构的大型 csv 文件:
+-----------+---------+--------+---------+--------+---------+
| file_name | | | | | |
+-----------+---------+--------+---------+--------+---------+
| table1 | | | | | |
+-----------+---------+--------+---------+--------+---------+
| Var1 | Var2 | Var3 | Var4 | Var5 | Var6 |
+-----------+---------+--------+---------+--------+---------+
| 198824 | 198824 | 198824 | 198824 | 198824 | 198824 |
+-----------+---------+--------+---------+--------+---------+
| 123 | 1234 | 1242 | 124 | 1241 | 1232 |
+-----------+---------+--------+---------+--------+---------+
| | | | | | |
+-----------+---------+--------+---------+--------+---------+
| | | | | | |
+-----------+---------+--------+---------+--------+---------+
| file_name | | | | | |
+-----------+---------+--------+---------+--------+---------+
| table2 | | | | | |
+-----------+---------+--------+---------+--------+---------+
| Var1 | Var2 | Var3 | Var4 | Var5 | Var6 |
+-----------+---------+--------+---------+--------+---------+
| x | x | x | x | x | x |
+-----------+---------+--------+---------+--------+---------+
| y | y | y | y | y | y |
+-----------+---------+--------+---------+--------+---------+
| z | z | z | z | z | z |
+-----------+---------+--------+---------+--------+---------+
| | | | | | |
+-----------+---------+--------+---------+--------+---------+
| | | | | | |
+-----------+---------+--------+---------+--------+---------+
| file_name | | | | | |
+-----------+---------+--------+---------+--------+---------+
| table3 | | | | | |
+-----------+---------+--------+---------+--------+---------+
| Var1 | Var2 | Var3 | Var4 | Var5 | Var6 |
+-----------+---------+--------+---------+--------+---------+
| 532523 | 25235 | 532523 | 25235 | 532523 | 25235 |
+-----------+---------+--------+---------+--------+---------+
| 25332 | 5325235 | 25332 | 5325235 | 25332 | 5325235 |
+-----------+---------+--------+---------+--------+---------+
数据并非完全非结构化,因为它遵循以下模式:
第一行只有文件名:file_name
第二行有表:table1、table2、table3 等。
以及实际的表本身,即来自 var1 的 6 列到 var6 及其下方的数据。
然后有 2 个空行,下一组将以 file_name 重复开始,然后是下一个表号和其中的表
CSV 中的所有表都遵循此模式,但我什至无法将其加载到 R 中,使用 read.csv() 直接加载时我得到以下信息:
Error in read.table(file = file, header = header, sep = sep, quote = quote, :
more columns than column names
这是否可以使用 R 加载到一个数据框中,并且表号成为一列,var1-var6 + 表号作为列标题?
IE
+--------+---------+--------+---------+--------+---------+--------------+
| Var1 | Var2 | Var3 | Var4 | Var5 | Var6 | table_number |
+--------+---------+--------+---------+--------+---------+--------------+
| 198824 | 198824 | 198824 | 198824 | 198824 | 198824 | table1 |
| 123 | 1234 | 1242 | 124 | 1241 | 1232 | table1 |
| x | x | x | x | x | x | table2 |
| y | y | y | y | y | y | table2 |
| z | z | z | z | z | z | table2 |
| 532523 | 25235 | 532523 | 25235 | 532523 | 25235 | table3 |
| 25332 | 5325235 | 25332 | 5325235 | 25332 | 5325235 | table3 |
+--------+---------+--------+---------+--------+---------+--------------+
请注意,每个表(table1、table2 等)的行数具有不同的行数。
CSV 文件总共有大约 200 个表,超过了 Excel 限制(我认为大约 9MM 行)
使用 Brian 的代码,这里是前几行:
> lines_all
[1] "name,,,,," "table1,,,,," "Var1,Var2,Var3,Var4,Var5,Var6" "321,54312,321,54654,3564,54321"
[5] "45,54,4564,54,87,456" ",,,,," ",,,,," "name,,,,,"
[9] "table2,,,,," "Var1,Var2,Var3,Var4,Var5,Var6" "ssvf,afs,fasf,afsaf,zxvz,zvx" "saf,zvx,zz,z,zxvz,zxvzxv"
[13] "zxvsaf,wr,wrw,afsaf,asf,af" ",,,,," ",,,,," "name,,,,,"
[17] "table3,,,,," "Var1,Var2,Var3,Var4,Var5,Var6" "1,2,3,4,5,6" "7,8,9,10,11,12"
[21] "13,14,15,16,17,18" "19,20,21,22,23,24"
解决方案
使用这个文件:
file_name
table1
Var1, Var2, Var3, Var4, Var5, Var6
198824, 198824, 198824, 198824, 198824, 198824
123, 1234, 1242, 124, 1241, 1232
file_name
table2
Var1, Var2, Var3, Var4, Var5, Var6
x, x, x, x, x, x
y, y, y, y, y, y
z, z, z, z, z, z
file_name
table3
Var1, Var2, Var3, Var4, Var5, Var6
532523, 25235, 532523, 25235, 532523, 25235
25332, 5325235, 25332, 5325235, 25332, 5325235
首先将所有内容作为字符向量读取。
library(readr)
library(stringr)
library(purrr)
library(dplyr)
# Could be done in base R, but {readr} will be faster on a large file
# read in all lines
lines_all <- read_lines("nested_tables.txt")
lines_all
#> [1] "file_name"
#> [2] "table1"
#> [3] "Var1, Var2, Var3, Var4, Var5, Var6"
#> [4] "198824, 198824, 198824, 198824, 198824, 198824"
#> [5] "123, 1234, 1242, 124, 1241, 1232"
#> [6] ""
#> [7] ""
#> [8] ""
#> [9] ""
#> [10] ""
#> [11] "file_name"
#> [12] "table2"
#> [13] "Var1, Var2, Var3, Var4, Var5, Var6"
#> [14] "x, x, x, x, x, x"
#> [15] "y, y, y, y, y, y"
#> [16] "z, z, z, z, z, z"
#> [17] ""
#> [18] ""
#> [19] ""
#> [20] ""
#> [21] ""
#> [22] "file_name"
#> [23] "table3"
#> [24] "Var1, Var2, Var3, Var4, Var5, Var6"
#> [25] "532523, 25235, 532523, 25235, 532523, 25235"
#> [26] "25332, 5325235, 25332, 5325235, 25332, 5325235"
在每个字符串中搜索与您的表名匹配的正则表达式。您可能需要调整匹配模式:"table[0-9]"
以匹配您的实际姓名。
# find where there's a string like "table1"
table_id_indices <- str_detect(lines_all, "table[0-9]")
table_id_indices
#> [1] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> [12] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> [23] TRUE FALSE FALSE FALSE
# extract the table names in order
table_id_names <- lines_all[table_id_indices]
table_id_names
#> [1] "table1" "table2" "table3"
现在您已经有了一个字符串向量以及每个 ID 开始的索引,您可以将向量拆分。
# split the vector of lines into a list of vectors
# `cumsum` is a handy trick to "fill" from one TRUE value to the next
lines_chunked <- split(lines_all, cumsum(table_id_indices))
lines_chunked
#> $`0`
#> [1] "file_name"
#>
#> $`1`
#> [1] "table1"
#> [2] "Var1, Var2, Var3, Var4, Var5, Var6"
#> [3] "198824, 198824, 198824, 198824, 198824, 198824"
#> [4] "123, 1234, 1242, 124, 1241, 1232"
#> [5] ""
#> [6] ""
#> [7] ""
#> [8] ""
#> [9] ""
#> [10] "file_name"
#>
#> $`2`
#> [1] "table2"
#> [2] "Var1, Var2, Var3, Var4, Var5, Var6"
#> [3] "x, x, x, x, x, x"
#> [4] "y, y, y, y, y, y"
#> [5] "z, z, z, z, z, z"
#> [6] ""
#> [7] ""
#> [8] ""
#> [9] ""
#> [10] ""
#> [11] "file_name"
#>
#> $`3`
#> [1] "table3"
#> [2] "Var1, Var2, Var3, Var4, Var5, Var6"
#> [3] "532523, 25235, 532523, 25235, 532523, 25235"
#> [4] "25332, 5325235, 25332, 5325235, 25332, 5325235"
要使这些行可读,请删除所有非表格行。
# remove lines that don't have commas, since they're not tables
lines_chunked_cleaned <- map(lines_chunked, ~str_subset(.x, ",")) %>% compact()
lines_chunked_cleaned
#> $`1`
#> [1] "Var1, Var2, Var3, Var4, Var5, Var6"
#> [2] "198824, 198824, 198824, 198824, 198824, 198824"
#> [3] "123, 1234, 1242, 124, 1241, 1232"
#>
#> $`2`
#> [1] "Var1, Var2, Var3, Var4, Var5, Var6"
#> [2] "x, x, x, x, x, x"
#> [3] "y, y, y, y, y, y"
#> [4] "z, z, z, z, z, z"
#>
#> $`3`
#> [1] "Var1, Var2, Var3, Var4, Var5, Var6"
#> [2] "532523, 25235, 532523, 25235, 532523, 25235"
#> [3] "25332, 5325235, 25332, 5325235, 25332, 5325235"
现在列表的每个元素都可以作为 CSV 读入。
# read in each vector of lines as a CSV
# forcing a default col_type prevents binding errors later
lines_chunked_csvs <- map(lines_chunked_cleaned, ~read_csv(.x, col_types = cols(.default = "c")))
lines_chunked_csvs
#> $`1`
#> # A tibble: 2 x 6
#> Var1 Var2 Var3 Var4 Var5 Var6
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 198824 198824 198824 198824 198824 198824
#> 2 123 1234 1242 124 1241 1232
#>
#> $`2`
#> # A tibble: 3 x 6
#> Var1 Var2 Var3 Var4 Var5 Var6
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 x x x x x x
#> 2 y y y y y y
#> 3 z z z z z z
#>
#> $`3`
#> # A tibble: 2 x 6
#> Var1 Var2 Var3 Var4 Var5 Var6
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 532523 25235 532523 25235 532523 25235
#> 2 25332 5325235 25332 5325235 25332 5325235
使用之前的名称来识别每个数据框并绑定它们。
# name the list of tables, bind everything together
bind_rows(set_names(lines_chunked_csvs, table_id_names), .id = "table")
#> # A tibble: 7 x 7
#> table Var1 Var2 Var3 Var4 Var5 Var6
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 table1 198824 198824 198824 198824 198824 198824
#> 2 table1 123 1234 1242 124 1241 1232
#> 3 table2 x x x x x x
#> 4 table2 y y y y y y
#> 5 table2 z z z z z z
#> 6 table3 532523 25235 532523 25235 532523 25235
#> 7 table3 25332 5325235 25332 5325235 25332 5325235
推荐阅读
- r - 在 gamm 中使用哪个 qq 图,什么是正确的模型验证?
- azure-functions - 在 azure cosmosdb 中捕获 Azure Blob 存储日志
- r - rollmean 用原始值填充 NA
- php - “wp i18n make-pot”无法生成可翻译文本
- javascript - python - 如何在没有类,ID或名称的情况下在python onclick中单击selenium
- javascript - 无法使用 Javascript Jquery 更新 HTML 文本的元素 ID
- awk - 将最大值除以第二大值
- javascript - 未捕获的类型错误:this.activeArray[i] 未定义
- distribution - 给定一个边际分布和另一组数据,你能削减数据以匹配边际分布吗?
- unity3d - 自定义值滑块统一