python - 读取数据并转换为具有某些特定格式的 json 格式
问题描述
我正在读取 csv 文件并将其转换为 json 文件,但我需要从数据中创建“地址”字段作为数组字段。我已经为它编写了代码,但无法将其作为数组。所有与地址相关的字段都在地址列下,地址列是一个数组字段。
输入文件
"source_id"|"first_name"|"last_name"|"address_type"|"address_line_1"|"city"
"41614335"|Reinaldo|Tonkoski Jr.|Primary|Deh 211 Box 2222|Brookings|
"41614335"|Reinaldo|Tonkoski Jr.|home|"2409 10th St Apt 123"|Brookings
"07605348"|E|Christodoulou|Primary|"4D Ag Lavras st"|Kifissia
"07605348"|E|Christodoulou|home|"131 N Hamilton Dr Apt 308"|Beverly Hills
输出获取
[
{
"source_id":7605348,
"first_name":"E",
"last_name":"Christodoulou",
"parsed_address":[
{
"address_type":"Primary",
"address_line_1":"4D Ag Lavras st",
"city":"Kifissia"
}
]
},
{
"source_id":7605348,
"first_name":"E",
"last_name":"Christodoulou",
"parsed_address":[
{
"address_type":"home",
"address_line_1":"131 N Hamilton Dr Apt 308",
"city":"Beverly Hills"
}
]
}
]
预期输出:
[
{
"source_id":7605348,
"first_name":"E",
"last_name":"Christodoulou",
"parsed_address":[
{
"address_type":"Primary",
"address_line_1":"4D Ag Lavras st",
"city":"Kifissia"
},
{
"address_type":"home",
"address_line_1":"131 N Hamilton Dr Apt 308",
"city":"Beverly Hills"
}
]
},
{
"source_id":41614335,
"first_name":"Reinaldo",
"last_name":"Tonkoski Jr.",
"parsed_address":[
{
"address_type":"Primary",
"address_line_1":"Deh 211 Box 2222",
"city":"Kifissia"
},
{
"address_type":"home",
"address_line_1":"2409 10th St Apt 123",
"city":"Beverly Hills"
}
]
}
]
代码尝试
df = pd.read_csv("file")
g_cols = ['source_id', 'first_name', 'last_name']
cols = ['address_type', 'address_line_1', 'city']
# Handling Address fields
df2 = df.drop_duplicates().groupby(g_cols)[cols].apply(lambda x: x.to_dict('records')).reset_index(
name="parsed_address").to_dict('record')
# Removing duplicate entry in address field
for i in range(0, len(df2)):
final_list = [dict(s) for s in set(frozenset(d.items()) for d in df2[i]["parsed_address"])]
df2[i]["parsed_address"] = final_list
# Convert resultant list to pandas dataframe
df22 = pd.DataFrame(df2)
print(df22)
解决方案
dic_address =[]
for source, group in df.groupby(by=["source_id"]):
address_dic = {}
address_dic["source_id"] = source
address_dic["address"] = group.drop(columns=["source_id", "first_name", "last_name").to_dict("record")
dic_address.append(address_dic)
print(dic_address)
df_add = pd.DataFrame(dic_address)
listval = ['first_name', 'last_name']
df_source_group = df.drop_duplicates().groupby("source_id")[listval].agg(
lambda x: ','.join(set(x))).reset_index().to_dict('record')
df22 = pd.DataFrame(df_source_group)
df_merge = pd.merge(df_add,df22)