python - 相同的代码,不同的输出,用于记录比较的 Python 数据框
问题描述
早上好,我的代码的目标是比较多个数据框中的记录。如果记录 ID 存在于另一个数据框中,则输出记录存在的月份。例如,如果 ID 1 存在于三月、四月,则输出三月四月。但是,当我在测试样本上运行此代码时,它可以完美运行。当我在原始数据集上运行它时,所需的输出是全部或全部,而不是单个月份。
下面的代码与作为测试样本的所需输出完美配合。
代码
#Import of Libraries
import pandas as pd
import numpy as np
import xlsxwriter
import nltk
from itertools import chain
data1 = {'aa',1,3,2,12,3,4,5,'bb',6,7,8,9,100,65,56,'cc',70}
data2 = {'aa',11,12,3,4,5,123,12,14,8,'cc',100,56}
data3 = {'aa',12,111,33,13,5,6,4,555,'bb',3333,65,634,7,8,8888,100}
data4 = {'aa',44,33,5,6,7,8,999,'bb',4,2,66,3,70,1,1,2}
df1 = pd.DataFrame(data1,columns=['RPN'])
df2 = pd.DataFrame(data2,columns=['RPN'])
df3 = pd.DataFrame(data3,columns=['RPN'])
df4 = pd.DataFrame(data4,columns=['RPN'])
df1 = df1.astype(str)
df2 = df2.astype(str)
df3 = df3.astype(str)
df4 = df4.astype(str)
#Creates list of Source and RPN to compare data
march = df4['RPN'].values.tolist()
april = df3['RPN'].values.tolist()
may = df2['RPN'].values.tolist()
june = df1['RPN'].values.tolist()
#turns list of each month into sets.
june = set(june)
may = set(may)
april = set(april)
march = set(march)
#creates list of every record in all months
setlist = [june,may,april,march]
#creats an interestion of all like values in the list of months
setall = set.intersection(*setlist)
setall
#Checks to see if current dataframe RPN and Source is in the previous audit report data
compare = []
for index,x in df1.iterrows():
RPN = x['RPN']
if RPN in setall:
compare.append('All Months')
elif RPN not in chain(setall, april, may) and RPN in march:
compare.append('March')
elif RPN not in chain(setall, march, may) and RPN in april:
compare.append('April')
elif RPN not in chain(setall, march, april) and RPN in may:
compare.append('May')
elif RPN not in chain(setall,march) and RPN in may and april:
compare.append('April and May')
elif RPN not in chain(setall,april) and RPN in may and march:
compare.append('March and May')
elif RPN not in chain(setall,may) and RPN in april and march:
compare.append('March and April')
else:
compare.append('New Record')
df1['Aging'] = compare
df1
正确输出
RPN Aging
0 1 March
1 2 March
2 3 March and May
3 4 All Months
4 5 All Months
5 bb March and April
6 6 March and April
7 7 March and April
8 8 All Months
9 9 New Record
10 100 April and May
11 12 April and May
12 65 April
13 cc May
14 70 March
15 aa All Months
16 56 May
我遇到的问题是,当我将完全相同的代码和格式引入原始数据集时,会出现 ALL 或 NOTHING 结果,而不是显示每条记录之间的差异。
#Import of Libraries
import pandas as pd
import numpy as np
import xlsxwriter
import nltk
from itertools import chain
#Creates dataframes
#Current Month
bucket='sagemaker-bucket-826404949026/Provider Data/Audit Comparison'
data_key = 'AuditJune2019.xlsx'
data_location = 's3://{}/{}'.format(bucket, data_key)
df = pd.read_excel(data_location)
df.info()
#Previous Month
bucket2 ='sagemaker-bucket-826404949026/Provider Data/Audit Comparison'
data_key2 = 'AuditMay2019.xlsx'
data_location2 = 's3://{}/{}'.format(bucket2, data_key2)
dfprev2 = pd.read_excel(data_location2)
dfprev2.info()
#April Month
bucket3 ='sagemaker-bucket-826404949026/Provider Data/Audit Comparison'
data_key3 = 'AuditApril2019.xlsx'
data_location3 = 's3://{}/{}'.format(bucket3, data_key2)
dfprev3 = pd.read_excel(data_location3)
dfprev3.info()
#March Month
bucket4 ='sagemaker-bucket-826404949026/Provider Data/Audit Comparison'
data_key4 = 'AuditMarch2019.xlsx'
data_location4 = 's3://{}/{}'.format(bucket4, data_key2)
dfprev4 = pd.read_excel(data_location4)
dfprev4.info()
#Creates list of Source and RPN to compare data
dfprev4 = dfprev4.fillna('0')
dfprev3 = dfprev3.fillna('0')
dfprev2 = dfprev2.fillna('0')
df = df.fillna('0')
df = df.astype(str)
dfprev2 = dfprev2.astype(str)
dfprev3 = dfprev3.astype(str)
dfprev4 = dfprev4.astype(str)
dfprev4['RPN'] = dfprev4['RPN'] + dfprev4['SOURCE']
dfprev3['RPN'] = dfprev3['RPN'] + dfprev3['SOURCE']
dfprev2['RPN'] = dfprev2['RPN'] + dfprev2['SOURCE']
df['RPN'] = df['RPN'] + df['SOURCE']
#Creates list of Source and RPN to compare data
march = dfprev4['RPN'].values.tolist()
april = dfprev3['RPN'].values.tolist()
may = dfprev2['RPN'].values.tolist()
june = df['RPN'].values.tolist()
#turns list of each month into sets.
june = set(june)
may = set(may)
april = set(april)
march = set(march)
#creates list of every record in all months
setlist = [june,may,april,march]
#creats an interestion of all like values in the list of months
setall = set.intersection(*setlist)
setall
#creates a dataframe of just RPN
df1 = pd.DataFrame(df['RPN'],columns = ['RPN'])
#Checks to see if current dataframe RPN and Source is in the previous audit report data
compare = []
for index,x in df1.iterrows():
RPN = x['RPN']
if RPN in setall:
compare.append('All Months')
elif RPN not in chain(setall, april, may) and RPN in march:
compare.append('March')
elif RPN not in chain(setall, march, may) and RPN in april:
compare.append('April')
elif RPN not in chain(setall, march, april) and RPN in may:
compare.append('May')
elif RPN not in chain(setall,march) and RPN in may and april:
compare.append('April and May')
elif RPN not in chain(setall,april) and RPN in may and march:
compare.append('March and May')
elif RPN not in chain(setall,may) and RPN in april and march:
compare.append('March and April')
else:
compare.append('New Record')
df1['Aging'] = compare
df1
不正确的输出
RPN Aging
0 testPORTICO New Record
1 test123PORTICO New Record
2 AG50001PORTICO New Record
3 AG50001FACETS New Record
4 0370001PORTICO New Record
5 0370001FACETS New Record
6 JY00001PORTICO New Record
7 JY00001FACETS New Record
8 JQ00001PORTICO New Record
9 JQ00001FACETS New Record
10 DH70001PORTICO All Months
11 DH70001FACETS All Months
12 8120001PORTICO All Months
13 8120001FACETS All Months
14 J760001PORTICO All Months
15 J760001FACETS All Months
16 MS200012PORTICO All Months
17 MS200012FACETS All Months
18 MS200012FACETS All Months
19 BZ400013PORTICO All Months
20 BZ400013FACETS All Months
我在想这可能是导入数据框的问题?我不确定请帮忙!
解决方案
考虑对您的代码进行以下修改。四个print
s 允许您直观地检查数据(以表格形式),所以我相信可能会对您有所帮助。
import pandas as pd
# dictionary of sets
data = {
'June':
{'aa', 1, 3, 2, 12, 3, 4, 5, 'bb', 6, 7, 8, 9, 100, 65, 56, 'cc', 70},
'May':
{'aa', 11, 12, 3, 4, 5, 123, 12, 14, 8, 'cc', 100, 56},
'April':
{'aa', 12, 111, 33, 13, 5, 6, 4, 555, 'bb', 3333, 65, 634, 7, 8, 8888, 100},
'March':
{'aa', 44, 33, 5, 6, 7, 8, 999, 'bb', 4, 2, 66, 3, 70, 1, 1, 2}}
# print(*(f'{key:>5}: {val}' for key, val in data.items()), sep='\n')
# dictionary with months only
months = {month: 0 for month in data}
# transform dictionary of sets into dictionary of dictionaries
new_data = dict()
for month in data:
for value in data[month]:
new_data.setdefault(str(value), months.copy())[month] = 1
# print(*(f'{key:>5}: {val}' for key, val in new_data.items()), sep='\n')
# create dataframe
df = pd.DataFrame.from_dict(new_data, orient='index')
df.index.name = 'RPN'
# print(df)
def compare(srs):
if srs.sum() == len(months):
return 'All months'
else:
return ', '.join(month
for month, check
in zip(srs.index, srs)
if check)
# add aging string
df['Aging'] = df.apply(compare, axis=1)
# print(df)
编辑
假设您的数据是以数据框的形式从带有 pandas 的文件中加载的,那么您可以data
像这样创建:
# kinda loaded data
a = pd.DataFrame({'RPN': ['aa', 1, 3, 2, 12, 3, 4, 5, 'bb', 6, 7, 8, 9, 100, 65, 56, 'cc', 70]})
b = pd.DataFrame({'RPN': ['aa', 11, 12, 3, 4, 5, 123, 12, 14, 8, 'cc', 100, 56]})
c = pd.DataFrame({'RPN': ['aa', 12, 111, 33, 13, 5, 6, 4, 555, 'bb', 3333, 65, 634, 7, 8, 8888, 100]})
d = pd.DataFrame({'RPN': ['aa', 44, 33, 5, 6, 7, 8, 999, 'bb', 4, 2, 66, 3, 70, 1, 1, 2]})
# dictionary of sets
data = {'June': a['RPN'], 'May': b['RPN'],
'April': c['RPN'], 'March': d['RPN']}
推荐阅读
- postgresql - 使用西班牙语词典 to_tsquery 的 PostgreSQL 全文搜索在某些单词中不起作用
- java - Spring WebClient,并行调用同一服务,收集成功和错误以进行响应
- reactjs - 如何根据角色在反应路由器v5中设置默认登录页面
- javascript - Javascript验证器表单通用?
- javascript - Bootstrap 5 警报未显示
- node.js - 如何在 Dust.js 中单独保留未定义的字段?
- python - 在 macOS 上使用 YOLO v3:OpenCV 不编译
- javascript - 来自托管 URL 的 Firebase UI
- django - postgreSQL如何在数据库中构造Django Array Fields
- java - 无法安装链码,状态为:500