首页 > 解决方案 > 分组并计算折叠的auc

问题描述

根据下面的数据,我想先计算每个算法的 AUC,并且按数据集分别计算。我尝试了如下代码,但它无法运行:

from sklearn.metrics import roc_auc_score,roc_curve,scorer
import pandas as pd

test = pd.DataFrame(dico)
# NOTE: GroupBy.apply() calls the function with ONE argument (the per-group
# DataFrame), so this two-parameter signature is what triggers the
# "missing 1 required positional argument: 'y'" TypeError quoted further down.
def auc_group(y_hat, y):
    return roc_auc_score(y_hat, y)

test.groupby(["Dataset", "Algo"]).apply(auc_group)

之后我还想对 KFold 的每一折(fold)做同样的计算,这只需要在 groupby 中再增加一层分组键:

from sklearn.metrics import roc_auc_score,roc_curve,scorer
import pandas as pd

test = pd.DataFrame(dico)
# NOTE: same two-parameter signature as in the first snippet; apply() still
# passes a single per-group object, so presumably it fails the same way.
def auc_group(y_hat, y):
    return roc_auc_score(y_hat, y)

# NOTE(review): the sample data listed below has no "Folds" column; it would
# have to be added to the frame before grouping on it.
test.groupby(["Dataset", "Algo", "Folds"]).apply(auc_group)

这是数据

# Sample data: 40 rows, all from the 'UCI' dataset. Rows 0-19 belong to the
# 'Gnb' algorithm and rows 20-39 to 'LR'. Each column maps row index -> value,
# which is the nested-dict layout pd.DataFrame() accepts directly.
dico = {
    'Dataset': dict(enumerate(['UCI'] * 40)),
    'Algo': dict(enumerate(['Gnb'] * 20 + ['LR'] * 20)),
    # Predicted probability of the positive class, one value per row.
    'p(y=1)': dict(enumerate([
        # rows 0-19 (Gnb)
        0.008566693461697914,
        0.023329740200720657,
        0.013079244223084688,
        0.0035655899487093525,
        0.5412516864202239,
        0.02437104068449619,
        0.0015772504872503706,
        0.01976775149918856,
        0.02580128697308947,
        0.052349648267671536,
        0.016115492810474592,
        0.028573206085476182,
        0.9975288953422592,
        0.1281394485094793,
        0.0014564219132441555,
        0.015625393606472308,
        0.15181450609384148,
        0.015221143650194884,
        0.022419878846782183,
        0.9991431483286071,
        # rows 20-39 (LR)
        0.04281920675218464,
        0.035985853029231185,
        0.05570563548576814,
        0.5468626213371839,
        0.01616233084557819,
        0.025090866736312712,
        0.4368789472788432,
        0.5268969392335681,
        0.06716466142340655,
        0.2093170587100108,
        0.008660602880515709,
        0.10929145816022637,
        0.04069088617214272,
        0.06683143493934368,
        0.06653318086395299,
        0.016010358473692744,
        0.08583523793056999,
        0.044347932186208014,
        0.014208157887412804,
        0.007949785472510792,
    ])),
    # Predicted label: 1 only on the listed rows, 0 everywhere else.
    'y_hat': {row: int(row in (4, 12, 19, 23, 27)) for row in range(40)},
    # True label: 1 only on the listed rows, 0 everywhere else.
    'y': {row: int(row in (12, 13, 19, 23)) for row in range(40)},
}

这是错误消息:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
    724             try:
--> 725                 result = self._python_apply_general(f)
    726             except Exception:

/opt/conda/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in _python_apply_general(self, f)
    741     def _python_apply_general(self, f):
--> 742         keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)
    743 

/opt/conda/lib/python3.7/site-packages/pandas/core/groupby/ops.py in apply(self, f, data, axis)
    236             group_axes = _get_axes(group)
--> 237             res = f(group)
    238             if not _is_indexed_like(res, group_axes):

TypeError: auc_group() missing 1 required positional argument: 'y'

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-23-eab997668f67> in <module>
      2     return roc_auc_score(y_hat, y)
      3 
----> 4 test.groupby(["Dataset", "Algo"]).apply(auc_group)

/opt/conda/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
    735 
    736                 with _group_selection_context(self):
--> 737                     return self._python_apply_general(f)
    738 
    739         return result

/opt/conda/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in _python_apply_general(self, f)
    740 
    741     def _python_apply_general(self, f):
--> 742         keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis)
    743 
    744         return self._wrap_applied_output(

/opt/conda/lib/python3.7/site-packages/pandas/core/groupby/ops.py in apply(self, f, data, axis)
    235             # group might be modified
    236             group_axes = _get_axes(group)
--> 237             res = f(group)
    238             if not _is_indexed_like(res, group_axes):
    239                 mutated = True

TypeError: auc_group() missing 1 required positional argument: 'y'

标签: python, pandas, scikit-learn, sklearn-pandas

解决方案


您的代码的问题在于:groupby(...).apply 会把每个分组对应的整个 DataFrame 作为唯一的参数传给 auc_group,而不是把其中的某几列分别传入。像下面这样修改 auc_group,让它接收整个 DataFrame 并在函数内部取出所需的列,即可解决问题:

def auc_group(df):
    """Return the ROC AUC of one group's predictions.

    Meant to be passed to ``DataFrame.groupby(...).apply()``, which calls it
    once per group with that group's rows as a DataFrame.

    Args:
        df: Rows of a single group. Must contain the columns ``y`` (true
            binary labels) and ``y_hat`` (predicted labels).

    Returns:
        The ``roc_auc_score`` of the group's predictions against its labels.
    """
    # roc_auc_score's signature is (y_true, y_score): ground truth goes first.
    # Passing the predictions first scores the labels against the predictions,
    # which is in general a different number. On the question's sample data
    # LR is 0.973684 with this order versus 0.750000 with the arguments
    # swapped (Gnb is 0.803922 either way), so any output printed from the
    # swapped call will not match.
    # NOTE(review): AUC is usually computed from continuous scores; the
    # 'p(y=1)' column may be the better y_score here - confirm intent.
    return roc_auc_score(df["y"], df["y_hat"])

做了这个修改之后,用你的数据运行

# Calls auc_group once per (Dataset, Algo) group; the printed result is a
# float64 Series indexed by those two keys.
test.groupby(["Dataset", "Algo"]).apply(auc_group)

会得到如下输出:

Dataset  Algo
UCI      Gnb     0.803922
         LR      0.750000
dtype: float64

推荐阅读