python - 组合多个csv的有效方法
问题描述
我有超过 100K 的 CSV(总文件大小超过 150 GB)需要加入。尽管列的顺序可能不匹配并且某些 csv 缺少一些列,但它们都有标准的列名。
现在我刚刚创建了一个数据框,并在每次迭代中不断将每个 csv 的数据框连接起来,以获得一个标准数据框,其中包含我最终打算保存为 csv 的所有列
我尝试使用 1000 个样本 csv 制作数据帧,并注意到随着数据帧大小的增加,迭代次数从每秒 10 次下降到每秒 1.5 次,这可能意味着如果我全押 100k csv,它将遵循类似的趋势,从而采取几天,如果不是几个月来组合它们。
有没有更好的方法来组合大量的 csv 文件?
这是我的代码
df_t1 = pd.DataFrame()
for i in tqdm(range(len(excelNames))):
thisCSV = str(excelNames[i]).lower().strip()
df = pd.read_csv(pathxl + "\\" + thisCSV, error_bad_lines=False, warn_bad_lines=False,low_memory=False)
df["File Name"] = pd.Series([thisCSV for x in range(len(df.index))])
if thisCSV.endswith('type1.csv'):
df_t1 = pd.concat([df_t1,df], axis=0, ignore_index=True)
df_t1.to_csv(outpath + "df_t1.csv", index = None, header=True, encoding='utf-8')
print("df_t1.csv generated")
解决方案
可能的改进
方法一:使用熊猫
#df_t1 = pd.DataFrame()
df_t1_lst = []
for i in tqdm(range(len(excelNames))):
thisCSV = str(excelNames[i]).lower().strip()
if thisCSV.endswith('type1.csv'):
df = pd.read_csv(pathxl + "\\" + thisCSV, error_bad_lines=False, warn_bad_lines=False,low_memory=False)
#df["File Name"] = pd.Series([thisCSV for x in range(len(df.index))]) --unnecessary to loop use next line instead
df["File Name"] = thisCSV # places thisCSV in every row
#df_t1 = pd.concat([df_t1,df], axis=0, ignore_index=True) # concat slow, append to list instead
df_t1_lst.append(df)
df_t1 = pd.concat(df_t1_lst, ignore_index=True) # Form dataframe from list (faster than pd.concat in loop)
df_t1.to_csv(outpath + "df_t1.csv", index = None, header=True, encoding='utf-8')
print("df_t1.csv generated")
方法 1a
使用 Pandas 连续追加到 CSV 输出文件
import os
import pandas as pd
def str_to_bytes(s):
' String to byte array '
result = bytearray()
result.extend(map(ord, s))
return result
def good_file(file_path):
""" Check if file exists and is not empty"""
# Check if file exist and it is empty
return os.path.exists(file_path) and os.stat(file_path).st_size > 0
SEPARATOR = ',' # Separator used by CSV file
write_header = True
pathxl = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
outpath = 'xxxxxxxxxxxxxxxxxxxxxxxxxx'
excelNames = ["xxx.csv", "xxxxx.csv"]
pathxl = r"C:\\Users\\darryl\\OneDrive\\Python"
outpath = pathxl + r"\\"
excelNames = ["test1_type1.csv", "test2_type1.csv"]
output_file = outpath + "df_t1.csv"
with open(output_file, "w") as ofile:
pass # create empty output file
for i in tqdm(range(len(excelNames))):
thisCSV = str(excelNames[i]).lower().strip()
input_file = pathxl + "\\" + thisCSV
if thisCSV.endswith('type1.csv') and good_file(input_file):
df = pd.read_csv(input_file)
if df.shape[0] > 0:
df['File Name'] = thisCSV # Add filename
df = df.sort_index(axis = 1) # sort based upon colunn in ascending order
# Append to output file
df.to_csv(output_file, mode='a',
index = False,
header= write_header)
write_header = False # Only write header once
del df
方法 2:二进制文件
读/写二进制文件和使用内存映射应该更快。
from tqdm import tqdm
import os
import mmap
def str_to_bytes(s):
' String to byte array '
result = bytearray()
result.extend(map(ord, s))
return result
def good_file(file_path):
""" Check if file exists and is not empty"""
# Check if file exist and it is empty
return os.path.exists(file_path) and os.stat(file_path).st_size > 0
SEPARATOR = ',' # Separator used by CSV file
header = None
pathxl = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
outpath = 'xxxxxxxxxxxxxxxxxxxxxxxxxx'
excelNames = ["xxx.csv", "xxxxx.csv"]
with open(outpath + "df_t1.csv", "wb") as ofile:
for i in tqdm(range(len(excelNames))):
thisCSV = str(excelNames[i]).lower().strip()
input_file = pathxl + "\\" + thisCSV
if thisCSV.endswith('type1.csv') and good_file(input_file):
with open(input_file, "rb") as ifile:
print('file ', thisCSV)
# memory-map the file, size 0 means whole file
with mmap.mmap(ifile.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
text_iter = iter(mmap_obj.read().split(b'\n'))
if header is None:
header = next(text_iter)
header = header.rstrip() + str_to_bytes(SEPARATOR + "File Name\n")
ofile.write(header) # write header
else:
next(text_iter) # ignore header row
# write data to output file
file_value = str_to_bytes(SEPARATOR + f"{thisCSV}\n")
for line in text_iter:
if line.strip(): # skip blank lines
ofile.write(line.rstrip() + file_value)
推荐阅读
- tensorflow - Tensorflow:tf.argmax() 作为预测还是最大值?
- mysql - Laravel:遍历连接集合
- sql - 根据 sql 中特定 id 的总和值查找行
- python-2.7 - 按下按钮保存时如何从另一个字段中获取值
- ruby-on-rails - 将 splat 属性从 Slim 转换为 ERB
- g++ - 在 cmake 时,在 '&&' 标记之前预期 ',' 或 '...',但在纯 g++ 编译时不会
- qt - 使用“-platform offscreen”参数运行的 Qt 应用程序无法建立 websocket 连接
- mysql - 如何在不重新启动服务器的情况下增加 max_allowed_packet 大小
- php - 如何以顺序模式对 PHP 数组进行数字排序
- python - 我被困在 while True 循环中