首页 > 解决方案 > 计算数据框中连续相同的数字

问题描述

我正在尝试计算以下此 Dataframe 中连续相同的值:

数据框

东风:

{'obligacion': {0: 200000000123,  1: 200000000123,  2: 200000000123,  3: 200000000123,  4: 00000000123,  5: 200000000123,  6: 200000000123,  7: 200000000123,  8: 200000000123,  9: 200000000123,  0: 200000000123,  11: 200000000123,  12: 200000000123,  13: 200000000123,  14: 200000000123,  15: 00000000123,  16: 200000000123,  17: 200000000123}, 0: {0: 'mora18',  1: 'mora17',  2: 'mora16',  3: mora15',  4: 'mora14',  5: 'mora13',  6: 'mora12',  7: 'mora11',  8: 'mora10',  9: 'mora9',  10: 'mora8',  1: 'mora7',  12: 'mora6',  13: 'mora5',  14: 'mora4',  15: 'mora3',  16: 'mora2',  17: 'mora1'}, dias_mora': {0: '-1',  1: '-1',  2: '-1',  3: '-1',  4: '-1',  5: '-1',  6: '-1',  7: '4',  8: '6',  9: 0',  10: '8',  11: '9',  12: '7',  13: '10',  14: '3',  15: '2',  16: '3',  17: '2'}}

所以我想要这样的输出:

[200000000123: (-1, 7), (4, 1), (6, 1), (0, 1), (8, 1), (9, 1), (7, 1), (10, 1 ), (3, 1), (2, 1), (3, 1), (2, 1)]

ID 号后跟数字的连续计数,例如:

数字 -1 连续重复 7 次

- - - - - - - -更新 - - - - - - - - -

使用的代码

import pandas as pd 
data = {
 'obligacion': [200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123] +
               [200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444],
 '0': [ 'mora18', 'mora17', 'mora16', 'mora15', 'mora14', 'mora13', 'mora12', 'mora11', 'mora10', 'mora9', 'mora8', 'mora7', 'mora6', 'mora5', 'mora4', 'mora3', 'mora2', 'mora1'] +
      [ 'kiwi18', 'kiwi17', 'kiwi16', 'kiwi15', 'kiwi14', 'kiwi13', 'kiwi12', 'kiwi11', 'kiwi10', 'kiwi9', 'kiwi8', 'kiwi7', 'kiwi6', 'kiwi5', 'kiwi4', 'kiwi3', 'kiwi2', 'kiwi1'], 
 'dias_mora': [ '-1', '-1', '-1', '-1', '-1', '-1', '-1', '4', '6', '0', '8', '9', '7', '10', '3', '2', '3', '2'] +
              [ '12', '0', '4', '4', '4', '7', '10', '4', '-6', '-7', '8', '8', '17', '10', '10', '-2', '3', '2']
}

df = pd.DataFrame.from_dict(data)  # convert dictionary to dataframe

dict_count = {}
for nid in df.obligacion.unique():
    vector_mora = df['dias_mora'][df.obligacion == nid].values
    groups = groupby(vector_mora)
    result = [(label, sum(1 for _ in group)) for label, group in groups] 
    dict_count[nid] = result
dict_count

标签: pythonpython-3.x

解决方案


我找不到只用 pandas 的聪明方法,所以需要一个列表和循环。

import pandas as pd

data = {
     'obligacion': [200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123] +
                   [200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444],
     '0': [ 'mora18', 'mora17', 'mora16', 'mora15', 'mora14', 'mora13', 'mora12', 'mora11', 'mora10', 'mora9', 'mora8', 'mora7', 'mora6', 'mora5', 'mora4', 'mora3', 'mora2', 'mora1'] +
          [ 'kiwi18', 'kiwi17', 'kiwi16', 'kiwi15', 'kiwi14', 'kiwi13', 'kiwi12', 'kiwi11', 'kiwi10', 'kiwi9', 'kiwi8', 'kiwi7', 'kiwi6', 'kiwi5', 'kiwi4', 'kiwi3', 'kiwi2', 'kiwi1'], 
     'dias_mora': [ '-1', '-1', '-1', '-1', '-1', '-1', '-1', '4', '6', '0', '8', '9', '7', '10', '3', '2', '3', '2'] +
                  [ '12', '0', '4', '4', '4', '7', '10', '4', '-6', '-7', '8', '8', '17', '10', '10', '-2', '3', '2']
}

df = pd.DataFrame.from_dict(data)  # convert dictionary to dataframe
lob = df['obligacion'].unique().tolist()   # distinct list of first columne
ddall = {}
for o in lob:  # each ob
    ldm = df[df['obligacion']==o]['dias_mora'].tolist()  # filter by ob, convert last column to list

    all = []
    cnt = 0
    for i in range(len(ldm)-1): # each element in list
       cnt += 1
       if ldm[i] != ldm[i+1]:   # if last element in this sequence
          all.append((ldm[i],cnt))  # append tuple to final list
          cnt = 0
    else:
       all.append((ldm[i+1],cnt+1))  # last element
    ddall[o] = [(int(e[0]),e[1]) for e in list(all)]

print(ddall)

输出

{
  200000000123: [(-1, 7), (4, 1), (6, 1), (0, 1), (8, 1), (9, 1), (7, 1), (10, 1), (3, 1), (2, 1), (3, 1), (2, 1)], 
  200000000444: [(12, 1), (0, 1), (4, 3), (7, 1), (10, 1), (4, 1), (-6, 1), (-7, 1), (8, 2), (17, 1), (10, 2), (-2, 1), (3, 1), (2, 1)]
}

- - 更新 - -

根据 Pandas 文档,应该避免迭代数据帧,因为它非常慢。为了加快这个脚本的速度,我将键列转换为列表,压缩它们并遍历 zip 对象。该脚本的运行速度大约是原来的两倍。输出是一样的。

这是更快的脚本:

import pandas as pd

data = {
     'obligacion': [200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123] +
                   [200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444],
     '0': [ 'mora18', 'mora17', 'mora16', 'mora15', 'mora14', 'mora13', 'mora12', 'mora11', 'mora10', 'mora9', 'mora8', 'mora7', 'mora6', 'mora5', 'mora4', 'mora3', 'mora2', 'mora1'] +
          [ 'kiwi18', 'kiwi17', 'kiwi16', 'kiwi15', 'kiwi14', 'kiwi13', 'kiwi12', 'kiwi11', 'kiwi10', 'kiwi9', 'kiwi8', 'kiwi7', 'kiwi6', 'kiwi5', 'kiwi4', 'kiwi3', 'kiwi2', 'kiwi1'], 
     'dias_mora': [ '-1', '-1', '-1', '-1', '-1', '-1', '-1', '4', '6', '0', '8', '9', '7', '10', '3', '2', '3', '2'] +
                  [ '12', '0', '4', '4', '4', '7', '10', '4', '-6', '-7', '8', '8', '17', '10', '10', '-2', '3', '2']
}

df = pd.DataFrame.from_dict(data)  # convert dictionary to dataframe

# convert key columns to lists for faster scan
lstob = df['obligacion'].to_list()
lstdm = df['dias_mora'].to_list()

ddall = {}
lastob = "___"  # will delete this entry
lst = []
lastv = cnt = 1
tt = zip(lstob, lstdm)  # combine lists for iteration
for t in tt:  # each ob/dm
   if t[0] != lastob:  # new ob
      lst.append((int(lastv), cnt)) # add last sequence
      ddall[lastob] = lst  # add list to dictionary
      lastob = t[0]
      lst = []
      lastv = t[1]
      cnt = 1
   else:  # same ob
      if t[1] != lastv: # if new dm
         lst.append((int(lastv), cnt))
         lastv = t[1]
         cnt = 1
      else:
         cnt += 1 # just increment ctr
else: # last row in dataset
   lst.append((int(t[1]), cnt))
   ddall[lastob] = lst

del ddall['___']  # remove temporary entry
print(ddall)

---- 更新 #2 ----

如果要在输出中添加 dias_mora,可以在计算值时收集 dm 条目。

为此,这是更新的代码:

import pandas as pd

data = {
     'obligacion': [200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123] +
                   [200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444],
     '0': [ 'mora18', 'mora17', 'mora16', 'mora15', 'mora14', 'mora13', 'mora12', 'mora11', 'mora10', 'mora9', 'mora8', 'mora7', 'mora6', 'mora5', 'mora4', 'mora3', 'mora2', 'mora1'] +
          [ 'kiwi18', 'kiwi17', 'kiwi16', 'kiwi15', 'kiwi14', 'kiwi13', 'kiwi12', 'kiwi11', 'kiwi10', 'kiwi9', 'kiwi8', 'kiwi7', 'kiwi6', 'kiwi5', 'kiwi4', 'kiwi3', 'kiwi2', 'kiwi1'], 
     'dias_mora': [ '-1', '-1', '-1', '-1', '-1', '-1', '-1', '4', '6', '0', '8', '9', '7', '10', '3', '2', '3', '2'] +
                  [ '12', '0', '4', '4', '4', '7', '10', '4', '-6', '-7', '8', '8', '17', '10', '10', '-2', '3', '2']
}

df = pd.DataFrame.from_dict(data)  # convert dictionary to dataframe

# convert key columns to lists for faster scan
lstob = df['obligacion'].to_list()
lst0  = df['0'].to_list()
lstdm = df['dias_mora'].to_list()

cur0 = ""

ddall = {}
lastob = "___"  # will delete this entry
lst = []
lastv = cnt = 1
tt = zip(lstob, lst0, lstdm)  # combine lists for iteration
for t in tt:  # each ob/dm
   if t[0] != lastob:  # new ob
      lst.append((int(lastv), cnt, cur0)) # add last sequence
      ddall[lastob] = lst  # add list to dictionary
      lastob = t[0]
      lst = []
      lastv = t[2]
      cur0 = t[1]
      cnt = 1
   else:  # same ob
      if t[2] != lastv: # if new dm
         lst.append((int(lastv), cnt, cur0))
         lastv = t[2]
         cur0 = t[1]
         cnt = 1
      else:
         cnt += 1 # just increment ctr
         cur0 += ',' + t[1]
else: # last row in dataset
   lst.append((int(t[2]), cnt, cur0))
   ddall[lastob] = lst

del ddall['___']  # remove temporary entry
print(ddall)

输出(格式化)

{
200000000123: [
    (-1, 7, 'mora18,mora17,mora16,mora15,mora14,mora13,mora12'), 
    (4,  1, 'mora11'), 
    (6,  1, 'mora10'), 
    (0,  1, 'mora9'), 
    (8,  1, 'mora8'), 
    (9,  1, 'mora7'), 
    (7,  1, 'mora6'), 
    (10, 1, 'mora5'), 
    (3,  1, 'mora4'), 
    (2,  1, 'mora3'), 
    (3,  1, 'mora2'), 
    (2,  1, 'mora1')], 
200000000444: [
    (12, 1, 'kiwi18'), 
    (0,  1, 'kiwi17'), 
    (4,  3, 'kiwi16,kiwi15,kiwi14'), 
    (7,  1, 'kiwi13'), 
    (10, 1, 'kiwi12'), 
    (4,  1, 'kiwi11'), 
    (-6, 1, 'kiwi10'), 
    (-7, 1, 'kiwi9'), 
    (8,  2, 'kiwi8,kiwi7'), 
    (17, 1, 'kiwi6'), 
    (10, 2, 'kiwi5,kiwi4'), 
    (-2, 1, 'kiwi3'), 
    (3,  1, 'kiwi2'), 
    (2,  1, 'kiwi1')]
}

split如果需要,可以使用 string 方法将 dm 列表转换为列表。


推荐阅读