首页 > 解决方案 > 如何优化处理 pandas DataFrame 并提高代码的可读性?

问题描述

下面的代码使用一些虚拟变量处理 pandas DataFrame 中的标志。对于包含 4000 行的数据集,代码似乎非常慢。

代码:

import pandas as pd
def dummy():
    df=pd.read_csv('DF.csv',header=0)
    df['North']=0
    df['South']=0
    df['Central']=0
    df['West']=0
    df['East']=0
    df['Remote']=0
    for i in range(len(df['Region'])):
        if df['Region'][i]=='North':
            df['North'][i]=1
        elif df['Region'][i]=='South':
            df['South'][i]=1
        elif df['Region'][i]=='East':
            df['East'][i]=1
        elif df['Region'][i]=='West':
            df['West'][i]=1
        elif df['Region'][i]=='Central':
            df['Central'][i]=1
        elif df['Region'][i]=='Remote':
            df['Remote'][i]=1
    df['apartment']=0
    df['house']=0
    df['townhouse']=0
    df['unit']=0
    df['villa']=0
    df['acreage']=0
    df['other']=0
    for i in range(len(df['Type_Property'])):
        if df['Type_Property'][i]=='house':
            df['house'][i]=1
        elif df['Type_Property'][i]=='apartment' or df['Type_Property'][i]=='flat':
            df['apartment'][i]=1
        elif df['Type_Property'][i]=='townhouse':
            df['townhouse'][i]=1
        elif df['Type_Property'][i]=='villa':
            df['villa'][i]=1
        elif df['Type_Property'][i]=='acreage+semi+rural' or df['Type_Property'][i]=='terrance':
            df['acreage'][i]=1
        elif df['Type_Property'][i]=='unit':
            df['unit'][i]=1
        else:
            df['other']=1
    df.to_csv('Dummied.csv')

dummy()

标签: pythonpandas

解决方案


第一部分

df['North']=0
df['South']=0
df['Central']=0
df['West']=0
df['East']=0
df['Remote']=0
df['apartment']=0
df['house']=0
df['townhouse']=0
df['unit']=0
df['villa']=0
df['acreage']=0
df['other']=0

变成

fields = ['North', 'South', 'Central', 'West', 'East', 'Remote', 'apartment', 'house', 'townhouse', 'unit', 'villa', 'acreage', 'other']
for field in fields:
    df[ field ] = 0

这部分

for i in range(len(df['Region'])):
        if df['Region'][i]=='North':
            df['North'][i]=1
        elif df['Region'][i]=='South':
            df['South'][i]=1
        elif df['Region'][i]=='East':
            df['East'][i]=1
        elif df['Region'][i]=='West':
            df['West'][i]=1
        elif df['Region'][i]=='Central':
            df['Central'][i]=1
        elif df['Region'][i]=='Remote':
            df['Remote'][i]=1

可以替换为

for i in range(len(df['Region'])):
        df[ df['Region'][i] ][i] = 1

另一个 for 循环

for i in range(len(df['Type_Property'])):
        if df['Type_Property'][i]=='house':
            df['house'][i]=1
        elif df['Type_Property'][i]=='apartment' or df['Type_Property'][i]=='flat':
            df['apartment'][i]=1
        elif df['Type_Property'][i]=='townhouse':
            df['townhouse'][i]=1
        elif df['Type_Property'][i]=='villa':
            df['villa'][i]=1
        elif df['Type_Property'][i]=='acreage+semi+rural' or df['Type_Property'][i]=='terrance':
            df['acreage'][i]=1
        elif df['Type_Property'][i]=='unit':
            df['unit'][i]=1
        else:
            df['other']=1

可能

for i in range(len(df['Type_Property'])):
    if df['Type_Property'][i] in ['house', 'apartment', 'townhouse', 'villa', 'unit']:
        df[ df['Type_Property'][i] ][i] = 1
    elif df['Type_Property'][i]=='flat':
        df['apartment'][i]=1
    elif df['Type_Property'][i]=='acreage+semi+rural' or df['Type_Property'][i]=='terrance':
        df['acreage'][i]=1
    else:
        df['other']=1

推荐阅读