首页 > 解决方案 > 测试数据帧处理管道的pythonic方法是什么

问题描述

测试 pandas 数据帧处理链的最佳方法是什么?我把脚本文件和测试文件精简后贴在下面,这样你就可以明白我的意思了。

我对最佳实践感到困惑。目前唯一的指导思路是:让各个测试可以按任意顺序运行,尽量减少从磁盘加载 CSV 的次数,同时确保管道中的任何一步都不会修改夹具(fixture)。流程中的每个步骤都依赖于前面的步骤,因此对每个节点做单元测试,实际上就是测试管道到该节点为止的累积处理结果。到目前为止我能完成任务,但由于每个测试都要逐步重新构建管道,出现了大量重复代码。

测试这种python脚本的方法是什么?

这是精简后的数据处理文件:

#main_script.py

def calc_allocation_methodology(df_row):
    """Return the allocation methodology label for a single row.

    Placeholder implementation: ``df_row`` is not inspected and every row
    gets the same label.

    Args:
        df_row: The row being classified (currently unused).

    Returns:
        The string ``'simple'``.
    """
    # Spelling of "methodology" corrected in the progress message.
    print('calculating allocation methodology')
    return 'simple'

def flag_data_for_the_allocation_methodology(df):
    """Return a frame with an ``allocation_methodology`` column added.

    Every row of ``df`` is passed to ``calc_allocation_methodology`` and the
    per-row results are stored in the new column.

    Args:
        df: The DataFrame to flag.

    Returns:
        A new DataFrame holding the columns of ``df`` plus
        ``allocation_methodology``.
    """
    allocation_methodology = df.apply(calc_allocation_methodology, axis=1)
    # ``DataFrame.assign`` returns a new frame instead of changing ``df`` in
    # place, so its result must be kept; discarding it silently drops the
    # new column.
    flagged = df.assign(allocation_methodology=allocation_methodology)
    print('flagging each row for the allocation methodology')
    return flagged

def convert_repeating_values_to_nan(df):
    """Keep one value and NaN the rest of the values.

    Placeholder: at present this only prints a progress message and hands
    ``df`` back as received.
    """
    progress_message = 'convert repeating values to nan'
    print(progress_message)
    return df

def melt_and_drop_accounting_columns(df):
    """Report the column count for the accounting stage and return ``df``.

    Placeholder: no melting or dropping is performed yet; the frame is
    returned as received.

    Args:
        df: An object exposing a ``shape`` of ``(rows, columns)``.

    Returns:
        The same ``df`` object that was passed in.
    """
    print('melt and drop accounting columns')
    # ``shape`` is (rows, columns): index 1 is the column count the message
    # refers to, index 0 would report the number of rows.
    print(f'columns remaining: {df.shape[1]}')
    return df
    
def melt_and_drop_engineering_columns(df):
    """Report the column count for the engineering stage and return ``df``.

    Placeholder: no melting or dropping is performed yet; the frame is
    returned as received.

    Args:
        df: An object exposing a ``shape`` of ``(rows, columns)``.

    Returns:
        The same ``df`` object that was passed in.
    """
    print('melt and drop engineering columns')
    # ``shape`` is (rows, columns): index 1 is the column count the message
    # refers to, index 0 would report the number of rows.
    print(f'columns remaining: {df.shape[1]}')
    return df
    
    
def process_csv_to_tiny_format(df):
    """Run ``df`` through every pipeline stage in order.

    The stages are applied with ``pipe``, each one receiving the result of
    the stage before it.

    Args:
        df: The starting DataFrame.

    Returns:
        Whatever the final stage returns.
    """
    print('process the entire pipeline')
    stages = (
        flag_data_for_the_allocation_methodology,
        convert_repeating_values_to_nan,
        melt_and_drop_accounting_columns,
        melt_and_drop_engineering_columns,
    )
    current = df
    for stage in stages:
        current = current.pipe(stage)
    return current

这是精简后的测试文件:



#test_main.py


from pytest import fixture
import main_script as main
import pandas as pd

@fixture(scope='session')
def df_from_csv():
    """Load the database dump from disk once per test session.

    The session scope means the CSV is read a single time and the resulting
    frame is shared by every test that requests this fixture.

    Returns:
        The DataFrame parsed from ``database_dump.csv``.
    """
    # pandas exposes ``read_csv``; there is no ``load_csv`` function.
    return pd.read_csv('database_dump.csv')

@fixture
def df_copy(df_from_csv):
    """Provide each test with its own copy of the session-scoped frame."""
    # Copying keeps the shared ``df_from_csv`` object out of the tests' hands.
    return df_from_csv.copy()

    
    
def test_expected_flag_data_for_the_allocation_methodology(df_copy):
    """Check that the flagging stage runs and yields a DataFrame."""
    node_to_test = df_copy.pipe(main.flag_data_for_the_allocation_methodology)
    # Minimal real check in place of the vacuous ``assert True``; extend with
    # stage-specific expectations.
    assert isinstance(node_to_test, pd.DataFrame)

def test_convert_repeating_values_to_nan(df_copy):
    """Check the pipeline up to the repeating-values stage yields a DataFrame."""
    node_to_test = (
        df_copy
        .pipe(main.flag_data_for_the_allocation_methodology)
        .pipe(main.convert_repeating_values_to_nan)
    )
    # Minimal real check in place of the vacuous ``assert True``; extend with
    # stage-specific expectations.
    assert isinstance(node_to_test, pd.DataFrame)
        
def test_melt_and_drop_accounting_columns(df_copy):
    """Check the pipeline up to the accounting stage yields a DataFrame."""
    node_to_test = (
        df_copy
        .pipe(main.flag_data_for_the_allocation_methodology)
        .pipe(main.convert_repeating_values_to_nan)
        .pipe(main.melt_and_drop_accounting_columns)
    )
    # Minimal real check in place of the vacuous ``assert True``; extend with
    # stage-specific expectations.
    assert isinstance(node_to_test, pd.DataFrame)
    
def test_melt_and_drop_engineering_columns(df_copy):
    """Check the pipeline up to the engineering stage yields a DataFrame."""
    node_to_test = (
        df_copy
        .pipe(main.flag_data_for_the_allocation_methodology)
        .pipe(main.convert_repeating_values_to_nan)
        .pipe(main.melt_and_drop_accounting_columns)
        .pipe(main.melt_and_drop_engineering_columns)
    )
    # Minimal real check in place of the vacuous ``assert True``; extend with
    # stage-specific expectations.
    assert isinstance(node_to_test, pd.DataFrame)

def test_process_csv_to_tiny_format(df_from_csv):
    """Run the complete pipeline on a copy of the session-scoped frame."""
    # The pipeline is given a copy, not the shared fixture object itself.
    tiny_data = main.process_csv_to_tiny_format(df_from_csv.copy())
    assert True

标签: python, pandas, pytest

解决方案


推荐阅读