首页 > 解决方案 > 将当前行与所有先前行进行比较

问题描述

对于df


    id          Date        ITEM_ID TYPE    GROUP
0   13710750    2019-07-01  SLM607  O       X
1   13710760    2019-07-01  SLM607  O       M
2   13710770    2019-07-03  SLM607  O       I
3   13710780    2019-09-03  SLM607  O       N
4   13667449    2019-08-02  887643  O       I
5   13667450    2019-08-02  792184  O       I
6   13728171    2019-09-17  SLM607  I       I
7   13667452    2019-08-02  794580  O       I
... ... ... ... ... ... ... ... ... ...

可重现的示例如下:

# Reproducible sample, written one tuple per row so it reads like the table
# printed above; the values follow the order given in `columns`.
columns = ['id', 'Date', 'ITEM_ID', 'TYPE', 'GROUP']
rows = [
    (13710750, '2019-07-01', 'SLM607', 'O', 'X'),
    (13710760, '2019-07-01', 'SLM607', 'O', 'M'),
    (13710770, '2019-07-03', 'SLM607', 'O', 'I'),
    (13710780, '2019-09-03', 'SLM607', 'O', 'N'),
    (13667449, '2019-08-02', '887643', 'O', 'I'),
    (13667450, '2019-08-02', '792184', 'O', 'I'),
    (13728171, '2019-09-17', 'SLM607', 'I', 'I'),
    (13667452, '2019-08-02', '794580', 'O', 'I'),
]

# Transpose the rows back into a column-name -> list-of-values mapping.
data = {name: list(values) for name, values in zip(columns, zip(*rows))}

df = pd.DataFrame(data)
df

我想按 ITEM_ID 进行循环,将当前行与之前的所有行进行比较,检查是否有行满足以下条件:

(1) 与当前行具有相同的 ITEM_ID;并且

(2) 如果当前行的 TYPE == I,并且 (1) 中的前一行的 TYPE == O,则删除这两行。

什么是有效的方法来做到这一点?


试过:

# For every row label x, check whether that row's ITEM_ID occurs among the
# ITEM_ID values of rows 0..x, and store the result in a new 'New' column.
# NOTE: .loc slicing is label-based and includes BOTH endpoints, so the slice
# 0:x contains row x itself; each row therefore always finds its own ITEM_ID,
# which is why every entry comes out True.
# (assumes `test` is the DataFrame shown above with its default integer index)
l = [test.loc[x,'ITEM_ID'] in test.loc[0:x,'ITEM_ID'].tolist() for x in np.arange(0,len(test))]
test['New']=l
test

目的是筛选出具有相同“ITEM_ID”的行,但最终每一行的结果都是 True。我不确定哪里出了问题,也不确定这是否是正确的方法。


更新:

@Reza 的解决方案

# Within each ITEM_ID group: True where a row has TYPE 'I' and the row just
# before it in the same group has TYPE 'O'.
df.groupby('ITEM_ID')['TYPE'].apply(lambda x: (x == 'I') & (x.shift() == 'O'))

似乎可行。我想再添加一个附加条件,即两行的 GROUP 也相同。

试过:

# Attempt: same per-group test as before, additionally requiring that the row
# and the previous row of its group share the same GROUP value.
# NOTE: the boolean result of groupby(...).apply(...) does not carry df's own
# index here (the traceback below passes through MultiIndex.reindex), so
# df.loc cannot align the mask with df's rows and raises.
df.loc[df.groupby('ITEM_ID').apply(lambda x: (x['TYPE'] == 'I') & (x['TYPE'].shift() == 'O') & (x['GROUP'] == x['GROUP'].shift()))]

但发现错误:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-177-f68c5ffc70cc> in <module>
----> 1 df.loc[df.groupby('ITEM_ID').apply(lambda x: (x['TYPE'] == 'I') & (x['TYPE'].shift() == 'O') & (x['GROUP'] == x['GROUP'].shift()))]

~\Anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
   1765 
   1766             maybe_callable = com.apply_if_callable(key, self.obj)
-> 1767             return self._getitem_axis(maybe_callable, axis=axis)
   1768 
   1769     def _is_scalar_access(self, key: Tuple):

~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
   1911             return self._get_slice_axis(key, axis=axis)
   1912         elif com.is_bool_indexer(key):
-> 1913             return self._getbool_axis(key, axis=axis)
   1914         elif is_list_like_indexer(key):
   1915 

~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getbool_axis(self, key, axis)
   1779         # caller is responsible for ensuring non-None axis
   1780         labels = self.obj._get_axis(axis)
-> 1781         key = check_bool_indexer(labels, key)
   1782         inds = key.nonzero()[0]
   1783         return self.obj._take_with_is_copy(inds, axis=axis)

~\Anaconda3\lib\site-packages\pandas\core\indexing.py in check_bool_indexer(index, key)
   2311     result = key
   2312     if isinstance(key, ABCSeries) and not key.index.equals(index):
-> 2313         result = result.reindex(index)
   2314         mask = isna(result._values)
   2315         if mask.any():

~\Anaconda3\lib\site-packages\pandas\core\series.py in reindex(self, index, **kwargs)
   4028     @Appender(generic.NDFrame.reindex.__doc__)
   4029     def reindex(self, index=None, **kwargs):
-> 4030         return super().reindex(index=index, **kwargs)
   4031 
   4032     def drop(

~\Anaconda3\lib\site-packages\pandas\core\generic.py in reindex(self, *args, **kwargs)
   4542         # perform the reindex on the axes
   4543         return self._reindex_axes(
-> 4544             axes, level, limit, tolerance, method, fill_value, copy
   4545         ).__finalize__(self)
   4546 

~\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
   4557             ax = self._get_axis(a)
   4558             new_index, indexer = ax.reindex(
-> 4559                 labels, level=level, limit=limit, tolerance=tolerance, method=method
   4560             )
   4561 

~\Anaconda3\lib\site-packages\pandas\core\indexes\multi.py in reindex(self, target, method, level, limit, tolerance)
   2423             else:
   2424                 # hopefully?
-> 2425                 target = MultiIndex.from_tuples(target)
   2426 
   2427         if (

~\Anaconda3\lib\site-packages\pandas\core\indexes\multi.py in from_tuples(cls, tuples, sortorder, names)
    487                 tuples = tuples._values
    488 
--> 489             arrays = list(lib.tuples_to_object_array(tuples).T)
    490         elif isinstance(tuples, list):
    491             arrays = list(lib.to_object_array_tuples(tuples).T)

pandas\_libs\lib.pyx in pandas._libs.lib.tuples_to_object_array()

ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long long'

标签: python pandas dataframe

解决方案


尝试以下代码,它会返回一个布尔 Series:

# Within each ITEM_ID group: True where a row has TYPE 'I' and the row just
# before it in the same group has TYPE 'O'.
df.groupby('ITEM_ID')['TYPE'].apply(lambda x: (x == 'I') & (x.shift() == 'O'))
0    False
1    False
2    False
3     True
4    False
Name: TYPE, dtype: bool

对于第二种情况,您可以使用:

def myfilter(x):
    """Return the boolean mask for a single ITEM_ID group ``x``.

    A row is True when its TYPE is 'I', the row just before it in the group
    has TYPE 'O', and both rows have the same GROUP value.
    """
    prev = x[['TYPE', 'GROUP']].shift()
    return (x['TYPE'] == 'I') & (prev['TYPE'] == 'O') & (x['GROUP'] == prev['GROUP'])


# Evaluate the same condition for all groups at once. A group-wise shift()
# returns the previous row *within each ITEM_ID* and keeps df's original
# index, so the mask lines up with df directly: no reset_index/sort_index is
# needed, and the result is a row mask even when df holds only one ITEM_ID
# (groupby.apply would return a wide DataFrame in that case).
prev = df.groupby('ITEM_ID')[['TYPE', 'GROUP']].shift()
mask = (df['TYPE'] == 'I') & (prev['TYPE'] == 'O') & (df['GROUP'] == prev['GROUP'])
mask

推荐阅读