首页 > 解决方案 > 组合多个csv的有效方法

问题描述

我有超过 100K 的 CSV(总文件大小超过 150 GB)需要加入。尽管列的顺序可能不匹配并且某些 csv 缺少一些列,但它们都有标准的列名。

现在我刚刚创建了一个数据框,并在每次迭代中不断将每个 csv 的数据框连接起来,以获得一个标准数据框,其中包含我最终打算保存为 csv 的所有列

我尝试用 1000 个样本 csv 构建数据帧,并注意到随着数据帧规模的增大,迭代速度从每秒 10 次下降到每秒 1.5 次。这可能意味着,如果我一次性处理全部 10 万个 csv,速度会按类似趋势继续下降,组合它们可能需要几天甚至几个月。

有没有更好的方法来组合大量的 csv 文件?

这是我的代码

# Baseline approach from the question: read every CSV and grow one DataFrame.
df_t1 = pd.DataFrame()  
for i in tqdm(range(len(excelNames))):
    thisCSV = str(excelNames[i]).lower().strip()
    # NOTE(review): error_bad_lines/warn_bad_lines are deprecated (removed in
    # pandas 2.0); on_bad_lines='skip' is the modern equivalent.
    df = pd.read_csv(pathxl + "\\" + thisCSV, error_bad_lines=False, warn_bad_lines=False,low_memory=False)
    # Tag every row with the name of the file it came from.
    df["File Name"] = pd.Series([thisCSV for x in range(len(df.index))])
    if thisCSV.endswith('type1.csv'):
        # Re-concatenating the accumulated frame copies ALL previously read
        # data on every iteration -> overall quadratic cost; this is the
        # slowdown described in the question.
        df_t1 = pd.concat([df_t1,df], axis=0, ignore_index=True)  
df_t1.to_csv(outpath + "df_t1.csv", index = None, header=True, encoding='utf-8') 
print("df_t1.csv generated")  

标签: pythonpandascsv

解决方案


可能的改进

方法一:使用 Pandas

# Method 1: collect per-file DataFrames in a list and concatenate ONCE at the
# end -- avoids the quadratic cost of pd.concat inside the loop.
df_t1_lst = []
for name in tqdm(excelNames):
    thisCSV = str(name).lower().strip()

    if thisCSV.endswith('type1.csv'):
        df = pd.read_csv(pathxl + "\\" + thisCSV, error_bad_lines=False,
                         warn_bad_lines=False, low_memory=False)
        # Scalar assignment broadcasts to every row -- no need to build a Series.
        df["File Name"] = thisCSV
        df_t1_lst.append(df)

if df_t1_lst:
    # Single concat over the whole list is linear in total data size.
    df_t1 = pd.concat(df_t1_lst, ignore_index=True)
    df_t1.to_csv(outpath + "df_t1.csv", index=None, header=True, encoding='utf-8')
    print("df_t1.csv generated")
else:
    # pd.concat([]) raises ValueError; report instead of crashing.
    print("no type1.csv files found -- df_t1.csv not generated")

方法 1a

使用 Pandas 连续追加到 CSV 输出文件

import os
import pandas as pd

def str_to_bytes(s):
    """Encode string *s* to a UTF-8 bytearray.

    The previous ``map(ord, s)`` implementation raised ``ValueError`` for any
    character with a code point above 255; encoding handles all of Unicode
    and is identical for the ASCII text this script feeds it.
    """
    return bytearray(s, 'utf-8')


def good_file(file_path):
    """Return True when *file_path* exists and contains at least one byte."""
    if not os.path.exists(file_path):
        return False
    return os.path.getsize(file_path) > 0
    
from tqdm import tqdm   # was missing: tqdm is used in the loop below

SEPARATOR = ','   # Separator used by CSV file
write_header = True

pathxl = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
outpath = 'xxxxxxxxxxxxxxxxxxxxxxxxxx'
excelNames = ["xxx.csv", "xxxxx.csv"]

pathxl = r"C:\\Users\\darryl\\OneDrive\\Python"
outpath = pathxl + r"\\"
excelNames = ["test1_type1.csv", "test2_type1.csv"]

output_file = outpath + "df_t1.csv"
# Truncate/create the output file so appends below start from empty.
with open(output_file, "w") as ofile:
    pass

for name in tqdm(excelNames):
    thisCSV = str(name).lower().strip()

    input_file = pathxl + "\\" + thisCSV
    if thisCSV.endswith('type1.csv') and good_file(input_file):
        df = pd.read_csv(input_file)
        if df.shape[0] > 0:                      # skip header-only files
            df['File Name'] = thisCSV            # tag every row with its source file
            # Sort columns so each appended chunk uses the same column ORDER.
            # NOTE(review): this only aligns order -- if some CSVs are missing
            # columns entirely, their rows will still be misaligned with the
            # header written first; confirm all inputs share one schema.
            df = df.sort_index(axis=1)

            # Append this file's rows to the single output CSV.
            df.to_csv(output_file, mode='a',
                      index=False,
                      header=write_header)

            write_header = False                 # write the header only once

        del df                                   # release memory before the next read
        

方法 2:二进制文件

读/写二进制文件和使用内存映射应该更快。

from tqdm import tqdm
import os
import mmap

def str_to_bytes(s):
    """Encode string *s* to a UTF-8 bytearray.

    The previous ``map(ord, s)`` implementation raised ``ValueError`` for any
    character with a code point above 255; encoding handles all of Unicode
    and is identical for the ASCII text this script feeds it.
    """
    return bytearray(s, 'utf-8')

def good_file(file_path):
    """Return True when *file_path* exists and contains at least one byte."""
    if not os.path.exists(file_path):
        return False
    return os.path.getsize(file_path) > 0
    
SEPARATOR = ','   # Separator used by CSV file
header = None     # header bytes taken from the first file; written exactly once

pathxl = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
outpath = 'xxxxxxxxxxxxxxxxxxxxxxxxxx'
excelNames = ["xxx.csv", "xxxxx.csv"]

with open(outpath + "df_t1.csv", "wb") as ofile:
    for name in tqdm(excelNames):
        thisCSV = str(name).lower().strip()

        input_file = pathxl + "\\" + thisCSV
        if thisCSV.endswith('type1.csv') and good_file(input_file):
            with open(input_file, "rb") as ifile:
                print('file ', thisCSV)
                # Memory-map the file (length=0 maps the whole file).
                # FIX: the original called mmap_obj.read(), which copied the
                # entire file into memory (then .split() doubled it), defeating
                # the point of mmap; readline() walks the mapping lazily.
                with mmap.mmap(ifile.fileno(), length=0, access=mmap.ACCESS_READ) as mmap_obj:
                    first_line = mmap_obj.readline()
                    if header is None:
                        # Extend the first file's header with the extra column.
                        header = first_line.rstrip() + str_to_bytes(SEPARATOR + "File Name\n")
                        ofile.write(header)
                    # For later files the header row is simply skipped.

                    # Suffix appended to every data row: ,<file name>\n
                    file_value = str_to_bytes(SEPARATOR + f"{thisCSV}\n")
                    for line in iter(mmap_obj.readline, b""):
                        if line.strip():             # skip blank lines
                            ofile.write(line.rstrip() + file_value)

推荐阅读