首页 > 解决方案 > Python Pandas - 根据字符串值解析 CSV 文件中的行

问题描述

我想知道是否可以使用 Pandas 逐行检查 CSV 文件,判断某一行中是否包含指定的单词(类似于在 Linux 系统中使用 grep)。单词出现在哪一列并不重要,只要该行中包含这个单词,就提取整行。我发现了 iterrows() 函数,但据我了解,当文件超过 1000 行时,这种方法的效率就非常低,而我的程序可能需要读取超过 100,000 行。非常感谢任何建议!

#Code was tested using Python v3.9.5
import os
import pandas as pd

def _rows_containing(frame, needle):
    """Return a boolean Series marking the rows of *frame* that contain *needle*.

    A row matches when the text of at least one of its cells contains
    *needle* as a literal, case-sensitive substring. Missing cells never
    match.
    """
    mask = pd.Series(False, index=frame.index)
    # Work column by column: each test is vectorised, unlike iterrows().
    for _, cells in frame.items():
        # astype(str) lets the .str accessor work on numeric columns too.
        hits = cells.astype(str).str.contains(needle, regex=False, na=False)
        # astype(str) may render missing cells as the text "nan"; mask them
        # out so that searching for "nan" does not match empty cells.
        mask |= hits & cells.notna()
    return mask


def parse_row(grep_value):
    """Export the rows of the import CSV that contain ``grep_value``.

    Reads the CSV at the module-level ``import_file_path``, keeps every row in
    which any column contains ``grep_value`` (literal, case-sensitive
    substring match, similar to ``grep -F``), and writes those rows to
    ``Export.csv`` inside a newly created "File Parser Exports" folder under
    the module-level ``export_file_path``. If that folder name is taken,
    " (1)", " (2)", ... is appended until a free name is found.

    Args:
        grep_value: Text to look for; it is converted with ``str()``.

    Returns:
        None. The result is written to disk and a summary is printed.
    """
    # Read and filter first, so a failing read does not leave an empty
    # export folder behind.
    file_data = pd.read_csv(import_file_path, lineterminator='\n')
    matching_rows = file_data[_rows_containing(file_data, str(grep_value))]

    # Find a folder name that is not in use yet.
    folder_counter = 0
    path = os.path.join(export_file_path, "File Parser Exports")
    while os.path.isdir(path):
        folder_counter += 1
        path = os.path.join(
            export_file_path, f"File Parser Exports ({folder_counter})"
        )
    os.mkdir(path)

    # os.path.join keeps the path portable; the extension-less base is built
    # once so numbered names never depend on dots elsewhere in the path.
    export_base = os.path.join(path, "Export")
    full_export_path = export_base + ".csv"
    file_count = 0
    while os.path.isfile(full_export_path):
        file_count += 1
        full_export_path = export_base + "-" + str(file_count) + ".csv"

    matching_rows.to_csv(full_export_path, index=False)

    print()
    print("File(s) exported to \"" + path + "\"")
    print("Successfully completed!")

# Module-level settings: parse_row() reads the two paths as globals.
import_file_path = "C:\\Users\\importpath"
export_file_path = "C:\\Users\\exportpath"

# Text to search for in every row of the import file.
grep_value = "The"

parse_row(grep_value=grep_value)

标签: python-3.x, pandas

解决方案


尝试这样的事情:

# Capture the column list before adding 'flag', so the flag column itself is
# not searched.
cols = df.columns.tolist()
df['flag'] = False
# Iterate by column, which is faster than iterating rows: each test is vectorised.
for col in cols:
    cells = df[col]
    # astype(str) lets .str work on numeric columns; na=False and notna()
    # keep missing cells from matching or injecting NaN into the boolean flag.
    df['flag'] |= cells.astype(str).str.contains('your_str', na=False) & cells.notna()

推荐阅读