首页 > 解决方案 > 在 python 中为 json 数据创建可配置代码

问题描述

我正在尝试用 Python 编写一段可配置的代码,把平面文件(CSV)转换为 JSON 文件。其中 email 列是多值列(同一条记录可能对应多个值),将来还可能增加其他多值列。因此代码不能针对某一列写死,而应当通过配置适用于任意数据。

平面文件中的输入数据

source_id,fname,lname,email,dob,line1,line2,line3,city,state,country
7,priya,kannan,shanthapriya794@gmail.com,07-12-1994,123,456,67,mdu,tn,india
7,priya,kannan,shanthapriya7964@gmail.com,07-12-1994,123,456,67,mdu,tn,india

实际得到的输出

[{
    "source_id": 7,
    "fname": "priya",
    "lname": "kannan",
    "date_of_birth": "07-12-1994",
    "address": [{
        "line1": 123,
        "line2": 456,
        "line3": 67,
        "city": "mdu",
        "state": "tn",
        "country": "india"
    }, {
        "line1": 123,
        "line2": 456,
        "line3": 67,
        "city": "mdu",
        "state": "tn",
        "country": "india"
    }]
}]

预期输出

[{
    "source_id": 7,
    "fname": "priya",
    "lname": "kannan",
    "date_of_birth": "07-12-1994",
    "email" : ["shanthapriya794@gmail.com","shanthapriya7964@gmail.com"],
    "address": [{
        "line1": 123,
        "line2": 456,
        "line3": 67,
        "city": "mdu",
        "state": "tn",
        "country": "india"
    }]
}]

已尝试的代码

文件.py

import pandas as pd
import json
from configuration import config

def main():
    """Build nested records from the configured flat file (asker's original attempt).

    Reads the input path, the regular ("reg_fields") columns and the
    multi-valued ("multi_value") columns from ``config``, loads the CSV and
    groups every column that is not a regular field into an ``address`` list.

    Returns:
        A DataFrame with one row per key group and an ``address`` column
        holding a list of dicts.

    NOTE(review): this is the code that produces the unwanted output quoted in
    the question; the inline notes below point at the causes.
    """

    path = config['path']['input_file_path']
    reg_col = config['columns']['reg_fields']
    multivalued_fields = config['columns']['multi_value']
    multivalued_fields = list(multivalued_fields.split(","))
    g_cols = list(reg_col.split(","))
    df = pd.read_csv(path, sep=",", header=0)
    # Columns that are NOT listed in reg_fields; these become the nested
    # "address" dicts further down.
    cols = df.columns[~df.columns.isin(g_cols)]
    # Grouping keys: the regular fields minus the multi-valued ones.
    g_cols = [ele for ele in g_cols if ele not in multivalued_fields]

    i=0
    while i < len(multivalued_fields):
        j = multivalued_fields[i]

        # NOTE(review): `assign(j=...)` creates a column literally named "j";
        # the keyword is not replaced by the value of the variable `j`.
        # NOTE(review): df2 is rebuilt from `df` on every pass, so only the
        # last multi-valued field's result survives the loop.
        df2 = (df.sort_values(g_cols).set_index(g_cols).assign(j=df.groupby(g_cols)[j].agg(lambda x: tuple(pd.unique(x)))).reset_index())
        i = i + 1

    
    # Only `cols` are selected for the nested dicts, so the aggregated column
    # added in the loop never reaches the result. The original multi-valued
    # column is still present in df2, so rows that differ only in that column
    # survive drop_duplicates() and yield repeated address entries.
    # NOTE(review): 'record' is an abbreviation of 'records'; newer pandas
    # releases appear to reject it — confirm against the installed version.
    df3 = df2.drop_duplicates().groupby(g_cols)[cols].apply(lambda x: x.to_dict('records')).reset_index(name='address').to_dict('record')
    df3 = pd.DataFrame(df3)
    return df3

def writefile_toJson(df):
    """Dump *df* to the fixed output file as a JSON array with one object per row."""
    target = 'outputfiles/jsonstructure1.json'
    df.to_json(target, orient='records')

if __name__ == "__main__":
    # Script entry point: build the nested frame, then persist it as JSON.
    result = main()
    writefile_toJson(result)

配置文件(config.ini)

[path]
input_file_path = Input_Files/flat_test_file.txt

[columns]
reg_fields = source_id,fname,lname,email,date_of_birth
multi_value = email

配置模块(configuration.py)

from configparser import ConfigParser

# Load the configuration once at import time; other modules obtain it via
# `from configuration import config`.
# The relative name below is resolved against the current working directory.
file = "config.ini"
config = ConfigParser()
# NOTE(review): ConfigParser.read() is documented to skip files it cannot open
# instead of raising, so a missing config.ini would presumably only surface
# later as a KeyError on section lookup — confirm this is acceptable.
config.read(file)

标签: python、json、pandas

解决方案


import pandas as pd
import configparser, ast, json
from configuration import config
import ast

def _split_option(raw):
    """Split a comma-separated config value into stripped, non-empty names."""
    return [name.strip() for name in raw.split(",") if name.strip()]


def _unique_values(series):
    """Return the distinct non-null values of *series* in first-seen order."""
    return series.dropna().drop_duplicates().tolist()


def main(settings=None):
    """Convert the configured flat file into one nested record per key group.

    Args:
        settings: Optional mapping of sections to options with the same layout
            as the INI file (``path.input_file_path``,
            ``json_path.input_json_path``, ``columns.reg_fields``,
            ``columns.multi_value``). Defaults to the module-level ``config``.

    Returns:
        A DataFrame with the key columns (regular fields that are not
        multi-valued), an ``address`` column holding a list of dicts built
        from every column that is neither a regular nor a multi-valued field,
        and one list-valued column per multi-valued field.

    Raises:
        ValueError: If no key column is left after removing the multi-valued
            fields from the regular fields.
        KeyError: If a configured column is absent from the input file.
    """
    if settings is None:
        settings = config

    path = settings['path']['input_file_path']
    json_path = settings['json_path']['input_json_path']
    reg_fields = _split_option(settings['columns']['reg_fields'])
    multivalued_fields = _split_option(settings['columns']['multi_value'])

    df = pd.read_csv(path, sep=",", header=0)
    with open(json_path, 'r', encoding='utf-8') as f:
        column_names = json.load(f)
    # rename() leaves columns without a mapping entry untouched; mapping the
    # column index directly would turn them into NaN.
    df = df.rename(columns=column_names)

    key_cols = [name for name in reg_fields if name not in multivalued_fields]
    if not key_cols:
        raise ValueError("reg_fields must contain at least one column that is not multi-valued")

    missing = [name for name in key_cols + multivalued_fields if name not in df.columns]
    if missing:
        raise KeyError(f"columns not found in input file: {missing}")

    excluded = set(reg_fields) | set(multivalued_fields)
    nested_cols = [name for name in df.columns if name not in excluded]

    # De-duplicate on keys + nested columns only, so rows that differ solely
    # in a multi-valued field collapse into a single address entry.
    nested = (
        df[key_cols + nested_cols]
        .drop_duplicates()
        .groupby(key_cols)[nested_cols]
        .apply(lambda group: group.to_dict('records'))
        .reset_index(name='address')
    )

    if not multivalued_fields:
        return nested

    # Lists (not sets) keep the first-seen order and make the output
    # deterministic across runs.
    multi = df.groupby(key_cols, as_index=False)[multivalued_fields].agg(_unique_values)

    return nested.merge(multi, on=key_cols)

def writefile_toJson(df):
    """Write *df* to ``outputfiles/jsonstructure1.json`` as a JSON array of records.

    The output directory is created when it does not exist yet, so the call
    does not fail on a fresh checkout.

    Args:
        df: DataFrame to serialize; each row becomes one JSON object.
    """
    import os

    output_path = 'outputfiles/jsonstructure1.json'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_json(output_path, orient='records')

if __name__ == "__main__":
    # Script entry point: assemble the nested records and write them to disk.
    nested_records = main()
    writefile_toJson(nested_records)

推荐阅读