首页 > 解决方案 > Python外连接

问题描述

下面的代码用于计算统计值。

import re
from pathlib import Path
import pandas as pd

def prepare_values(df):
    df_columns = ['frame.time_delta_displayed', 'frame.len']
    df_values = []
    for col in df_columns:
        df_values +=[
            df[col].max(),
            df[col].min(),
            df[col].std(),
            df[col].quantile(0.25),
            df[col].quantile(0.5),
            df[col].quantile(0.75),
            df[col].mean(),
            df[col].mad(),
            df[col].var(),
            df[col].skew(),
            df[col].kurtosis(),
            df[col].sum(),
        ]
    return df_values


source_dir = Path('/media/root/HASARA/Snipping Experiment-App Activities/Time-0.5/InOutFiltered')

in_data = []
for file in source_dir.glob('**/*.in.csv'):
    activity = {'activity': file.stem.split('.')[0]}
    df = pd.read_csv(file)
    cols =['maxTimeIn', 'minTimeIn', 'stdTimeIn', 'q1TimeIn', 'q2TimeIn', 'q3TimeIn', 'meanTimeIn', 'madTimeIn', 'varianceTimeIn', 'skewTimeIn', 'kurtosisTimeIn', 'sumTimeIn', 'maxLenIn', 'minLenIn', 'stdLenIn', 'q1LenIn','q2lenIn', 'q3LenIn', 'meanLenIn', 'madLenIn', 'varianceLenIn', 'skewLenIn', 'kurtosisLenIn', 'sumLenIn']
    values = prepare_values(df)
    file_data ={**activity, **dict(zip(cols,values))}
    in_data.append(file_data)

out_data =[]
for file in source_dir.glob('**/*.out.csv'):
    activity = {'activity': file.stem.split('.')[0]}
    df = pd.read_csv(file)
    cols =['maxTimeOut', 'minTimeOut', 'stdTimeOut', 'q1TimeOut', 'q2TimeOut', 'q3TimeOut', 'meanTimeOut', 'madTimeOut', 'varianceTimeOut', 'skewTimeOut', 'kurtosisTimeOut', 'sumTimeOut', 'maxLenOut', 'minLenOut', 'stdLenOut', 'q1LenOut', 'q2LenOut', 'q3LenOut', 'meanLenOut', 'madLenOut', 'varianceLenOut', 'skewLenOut', 'kurtosisLenOut','sumLenOut']
    values=prepare_values(df)
    file_data = {**activity, **dict(zip(cols, values))}
    out_data.append(file_data)

in_df = pd.DataFrame(in_data)
out_df = pd.DataFrame(out_data)
all_df = in_df.join(out_df.set_index('activity'), on='activity', how='outer')

all_df.dropna(subset=all_df.columns.tolist()[1:], how='all', inplace=True)
all_df.fillna(0, inplace=True)
all_df['activity'] = all_df['activity'].apply(lambda x:re.sub(r'^([a-zA-Z]+).*', r'\1',x))
all_df.to_csv('/media/root/HASARA/Snipping Experiment-App Activities/Time-0.5/AllDataNew.csv', index=False)

我收到一个错误。想不通是什么意思。

Traceback (most recent call last):
  File "/root/PycharmProjects/AppAct/StatisticCal.py", line 48, in <module>
    all_df= in_df.join(out_df.set_index('activity'), on='activity', how='outer')
  File "/root/PycharmProjects/AppAct/venv/lib/python3.7/site-packages/pandas/core/frame.py", line 4178, in set_index
    level = frame[col]._values
  File "/root/PycharmProjects/AppAct/venv/lib/python3.7/site-packages/pandas/core/frame.py", line 2927, in __getitem__
    indexer = self.columns.get_loc(key)
  File "/root/PycharmProjects/AppAct/venv/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2659, in get_loc
    return self._engine.get_loc(self._maybe_cast_indexer(key))
  File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'activity'

标签: pythonpython-3.xpandas

解决方案


推荐阅读