python - 计算数据框中连续相同的数字
问题描述
我正在尝试计算以下此 Dataframe 中连续相同的值:
东风:
{'obligacion': {0: 200000000123, 1: 200000000123, 2: 200000000123, 3: 200000000123, 4: 00000000123, 5: 200000000123, 6: 200000000123, 7: 200000000123, 8: 200000000123, 9: 200000000123, 0: 200000000123, 11: 200000000123, 12: 200000000123, 13: 200000000123, 14: 200000000123, 15: 00000000123, 16: 200000000123, 17: 200000000123}, 0: {0: 'mora18', 1: 'mora17', 2: 'mora16', 3: mora15', 4: 'mora14', 5: 'mora13', 6: 'mora12', 7: 'mora11', 8: 'mora10', 9: 'mora9', 10: 'mora8', 1: 'mora7', 12: 'mora6', 13: 'mora5', 14: 'mora4', 15: 'mora3', 16: 'mora2', 17: 'mora1'}, dias_mora': {0: '-1', 1: '-1', 2: '-1', 3: '-1', 4: '-1', 5: '-1', 6: '-1', 7: '4', 8: '6', 9: 0', 10: '8', 11: '9', 12: '7', 13: '10', 14: '3', 15: '2', 16: '3', 17: '2'}}
所以我想要这样的输出:
[200000000123: (-1, 7), (4, 1), (6, 1), (0, 1), (8, 1), (9, 1), (7, 1), (10, 1 ), (3, 1), (2, 1), (3, 1), (2, 1)]
ID 号后跟数字的连续计数,例如:
数字 -1 连续重复 7 次
- - - - - - - -更新 - - - - - - - - -
使用的代码
import pandas as pd
data = {
'obligacion': [200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123] +
[200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444],
'0': [ 'mora18', 'mora17', 'mora16', 'mora15', 'mora14', 'mora13', 'mora12', 'mora11', 'mora10', 'mora9', 'mora8', 'mora7', 'mora6', 'mora5', 'mora4', 'mora3', 'mora2', 'mora1'] +
[ 'kiwi18', 'kiwi17', 'kiwi16', 'kiwi15', 'kiwi14', 'kiwi13', 'kiwi12', 'kiwi11', 'kiwi10', 'kiwi9', 'kiwi8', 'kiwi7', 'kiwi6', 'kiwi5', 'kiwi4', 'kiwi3', 'kiwi2', 'kiwi1'],
'dias_mora': [ '-1', '-1', '-1', '-1', '-1', '-1', '-1', '4', '6', '0', '8', '9', '7', '10', '3', '2', '3', '2'] +
[ '12', '0', '4', '4', '4', '7', '10', '4', '-6', '-7', '8', '8', '17', '10', '10', '-2', '3', '2']
}
df = pd.DataFrame.from_dict(data) # convert dictionary to dataframe
dict_count = {}
for nid in df.obligacion.unique():
vector_mora = df['dias_mora'][df.obligacion == nid].values
groups = groupby(vector_mora)
result = [(label, sum(1 for _ in group)) for label, group in groups]
dict_count[nid] = result
dict_count
解决方案
我找不到只用 pandas 的聪明方法,所以需要一个列表和循环。
import pandas as pd
data = {
'obligacion': [200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123] +
[200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444],
'0': [ 'mora18', 'mora17', 'mora16', 'mora15', 'mora14', 'mora13', 'mora12', 'mora11', 'mora10', 'mora9', 'mora8', 'mora7', 'mora6', 'mora5', 'mora4', 'mora3', 'mora2', 'mora1'] +
[ 'kiwi18', 'kiwi17', 'kiwi16', 'kiwi15', 'kiwi14', 'kiwi13', 'kiwi12', 'kiwi11', 'kiwi10', 'kiwi9', 'kiwi8', 'kiwi7', 'kiwi6', 'kiwi5', 'kiwi4', 'kiwi3', 'kiwi2', 'kiwi1'],
'dias_mora': [ '-1', '-1', '-1', '-1', '-1', '-1', '-1', '4', '6', '0', '8', '9', '7', '10', '3', '2', '3', '2'] +
[ '12', '0', '4', '4', '4', '7', '10', '4', '-6', '-7', '8', '8', '17', '10', '10', '-2', '3', '2']
}
df = pd.DataFrame.from_dict(data) # convert dictionary to dataframe
lob = df['obligacion'].unique().tolist() # distinct list of first columne
ddall = {}
for o in lob: # each ob
ldm = df[df['obligacion']==o]['dias_mora'].tolist() # filter by ob, convert last column to list
all = []
cnt = 0
for i in range(len(ldm)-1): # each element in list
cnt += 1
if ldm[i] != ldm[i+1]: # if last element in this sequence
all.append((ldm[i],cnt)) # append tuple to final list
cnt = 0
else:
all.append((ldm[i+1],cnt+1)) # last element
ddall[o] = [(int(e[0]),e[1]) for e in list(all)]
print(ddall)
输出
{
200000000123: [(-1, 7), (4, 1), (6, 1), (0, 1), (8, 1), (9, 1), (7, 1), (10, 1), (3, 1), (2, 1), (3, 1), (2, 1)],
200000000444: [(12, 1), (0, 1), (4, 3), (7, 1), (10, 1), (4, 1), (-6, 1), (-7, 1), (8, 2), (17, 1), (10, 2), (-2, 1), (3, 1), (2, 1)]
}
- - 更新 - -
根据 Pandas 文档,应该避免迭代数据帧,因为它非常慢。为了加快这个脚本的速度,我将键列转换为列表,压缩它们并遍历 zip 对象。该脚本的运行速度大约是原来的两倍。输出是一样的。
这是更快的脚本:
import pandas as pd
data = {
'obligacion': [200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123] +
[200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444],
'0': [ 'mora18', 'mora17', 'mora16', 'mora15', 'mora14', 'mora13', 'mora12', 'mora11', 'mora10', 'mora9', 'mora8', 'mora7', 'mora6', 'mora5', 'mora4', 'mora3', 'mora2', 'mora1'] +
[ 'kiwi18', 'kiwi17', 'kiwi16', 'kiwi15', 'kiwi14', 'kiwi13', 'kiwi12', 'kiwi11', 'kiwi10', 'kiwi9', 'kiwi8', 'kiwi7', 'kiwi6', 'kiwi5', 'kiwi4', 'kiwi3', 'kiwi2', 'kiwi1'],
'dias_mora': [ '-1', '-1', '-1', '-1', '-1', '-1', '-1', '4', '6', '0', '8', '9', '7', '10', '3', '2', '3', '2'] +
[ '12', '0', '4', '4', '4', '7', '10', '4', '-6', '-7', '8', '8', '17', '10', '10', '-2', '3', '2']
}
df = pd.DataFrame.from_dict(data) # convert dictionary to dataframe
# convert key columns to lists for faster scan
lstob = df['obligacion'].to_list()
lstdm = df['dias_mora'].to_list()
ddall = {}
lastob = "___" # will delete this entry
lst = []
lastv = cnt = 1
tt = zip(lstob, lstdm) # combine lists for iteration
for t in tt: # each ob/dm
if t[0] != lastob: # new ob
lst.append((int(lastv), cnt)) # add last sequence
ddall[lastob] = lst # add list to dictionary
lastob = t[0]
lst = []
lastv = t[1]
cnt = 1
else: # same ob
if t[1] != lastv: # if new dm
lst.append((int(lastv), cnt))
lastv = t[1]
cnt = 1
else:
cnt += 1 # just increment ctr
else: # last row in dataset
lst.append((int(t[1]), cnt))
ddall[lastob] = lst
del ddall['___'] # remove temporary entry
print(ddall)
---- 更新 #2 ----
如果要在输出中添加 dias_mora,可以在计算值时收集 dm 条目。
为此,这是更新的代码:
import pandas as pd
data = {
'obligacion': [200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123, 200000000123] +
[200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444, 200000000444],
'0': [ 'mora18', 'mora17', 'mora16', 'mora15', 'mora14', 'mora13', 'mora12', 'mora11', 'mora10', 'mora9', 'mora8', 'mora7', 'mora6', 'mora5', 'mora4', 'mora3', 'mora2', 'mora1'] +
[ 'kiwi18', 'kiwi17', 'kiwi16', 'kiwi15', 'kiwi14', 'kiwi13', 'kiwi12', 'kiwi11', 'kiwi10', 'kiwi9', 'kiwi8', 'kiwi7', 'kiwi6', 'kiwi5', 'kiwi4', 'kiwi3', 'kiwi2', 'kiwi1'],
'dias_mora': [ '-1', '-1', '-1', '-1', '-1', '-1', '-1', '4', '6', '0', '8', '9', '7', '10', '3', '2', '3', '2'] +
[ '12', '0', '4', '4', '4', '7', '10', '4', '-6', '-7', '8', '8', '17', '10', '10', '-2', '3', '2']
}
df = pd.DataFrame.from_dict(data) # convert dictionary to dataframe
# convert key columns to lists for faster scan
lstob = df['obligacion'].to_list()
lst0 = df['0'].to_list()
lstdm = df['dias_mora'].to_list()
cur0 = ""
ddall = {}
lastob = "___" # will delete this entry
lst = []
lastv = cnt = 1
tt = zip(lstob, lst0, lstdm) # combine lists for iteration
for t in tt: # each ob/dm
if t[0] != lastob: # new ob
lst.append((int(lastv), cnt, cur0)) # add last sequence
ddall[lastob] = lst # add list to dictionary
lastob = t[0]
lst = []
lastv = t[2]
cur0 = t[1]
cnt = 1
else: # same ob
if t[2] != lastv: # if new dm
lst.append((int(lastv), cnt, cur0))
lastv = t[2]
cur0 = t[1]
cnt = 1
else:
cnt += 1 # just increment ctr
cur0 += ',' + t[1]
else: # last row in dataset
lst.append((int(t[2]), cnt, cur0))
ddall[lastob] = lst
del ddall['___'] # remove temporary entry
print(ddall)
输出(格式化)
{
200000000123: [
(-1, 7, 'mora18,mora17,mora16,mora15,mora14,mora13,mora12'),
(4, 1, 'mora11'),
(6, 1, 'mora10'),
(0, 1, 'mora9'),
(8, 1, 'mora8'),
(9, 1, 'mora7'),
(7, 1, 'mora6'),
(10, 1, 'mora5'),
(3, 1, 'mora4'),
(2, 1, 'mora3'),
(3, 1, 'mora2'),
(2, 1, 'mora1')],
200000000444: [
(12, 1, 'kiwi18'),
(0, 1, 'kiwi17'),
(4, 3, 'kiwi16,kiwi15,kiwi14'),
(7, 1, 'kiwi13'),
(10, 1, 'kiwi12'),
(4, 1, 'kiwi11'),
(-6, 1, 'kiwi10'),
(-7, 1, 'kiwi9'),
(8, 2, 'kiwi8,kiwi7'),
(17, 1, 'kiwi6'),
(10, 2, 'kiwi5,kiwi4'),
(-2, 1, 'kiwi3'),
(3, 1, 'kiwi2'),
(2, 1, 'kiwi1')]
}
split
如果需要,可以使用 string 方法将 dm 列表转换为列表。
推荐阅读
- api - 如何将多个 OpenAPI 3 规范文件组合在一起?
- android - 如何使用自定义相机解决在 Android 中自动裁剪图像的问题?
- python - 跨 numpy 数组的多个索引执行相同的自定义操作
- javascript - 在 javascript/jquery 的选择选项值中设置双引号字符串,也可以从 javascript/jquery 获取
- php - 我知道闭包可以用作变量值。但我很困惑哪些被称为这个值
- java - Selenium:带有下拉框的 HtmlUnit UnexpectedTagNameException 错误
- fatal-error - git diff 致命:不明确的参数未知修订版或路径不在工作树中
- c# - 如何正确地将样式应用于 Open Office XML 书签文本
- wordpress - 为什么添加到 WordPress 中禁用的菜单框
- postgresql - 哪些配置可以使 Postgres 在 pgAdmin4 中加载更快?