首页 > 解决方案 > 包含不同数据类型的dict的文件处理(I/O)

问题描述

我想将字典存储在四种不同的文件类型(*.csv, *.json, *.txt, *.dat)中,其中包含几个str, a listof str, a pd.Dataframe, np.ndarrays, np.ndarrays of np.ndarrays 和floats。我尝试了几种可能性,但它们都只起作用了一部分。我无法将 转换dict为 apd.DataFrame因为 dict 的条目有不同的长度。否则,选择的方法将是pd.DataFrame.to_csv/pd.read_csvpd.DataFrame.to_json/ pd.DataFrame.read_json,分别。

读取文件意味着读取先前程序运行的保存状态(数据将变为 a pd.DataFrame)。随机值将在程序运行期间被计算替换(或使用早期保存的配置文件中的值),因此无需担心这些np.random.rand部分。

注释掉的行是我在不同网站上发现的不同可能性(包括官方文档,它没有预期的那么有用)。

写文件

import pandas as pd
import numpy as np
import csv
import os
import json

content = {
    'rootpath' : os.path.abspath ( os.getcwd () ),
    'indir' : '.',
    'outdir' : 'output',
    'workdir' : 'temporary',
    'datafile' : 'file.csv',
    'configfile' : 'configs.txt',
    'savefile' : 'contents.csv',
    'model' : np.arange (4),
    'datafiles' : [os.path.join (os.getcwd (), 'data.csv'), os.path.join (os.getcwd (), 'data.dat')],
    'data' : pd.DataFrame ( np.arange (15).reshape (3, 5) ),
    'dataid' : 'g_g;r_g;r_r;',
    'result' : None,
}

hmm = {
    'hmm_a' : np.random.rand (9).reshape (3, 3), # test input of a two-dimensional (np.nd)array because one isn't enough and they will appear (during/after computation)
    'hmm_b' : np.zeros (3),
    'hmm_mu' : np.random.rand (1),
    'hmm_pi' : np.random.rand (3),
    'hmm_block' : np.random.rand (27).reshape (3, 3, 3), # test input of a three-dimensional (np.nd)array because of their appearance
}

'''
computation, changing pathname and filename, rest of the program
'''

write_in = content.copy ()
write_in.update (hmm)
path = os.path.join ( write_in ['rootpath'], write_in ['outdir'], write_in ['savefile'] )
p, filetype = os.path.splitext (path)

'''
if write_in == write_in ['data'] : # write_in only contains data
    onlydata = True
else :
    onlydata = False
'''

for c in write_in :
    if type ( write_in [c] ) == np.ndarray :
        write_in [c] = write_in [c].tolist ()
    elif type ( write_in [c] ) == pd.DataFrame :
        #write_in [c] = pd.DataFrame.to_numpy ( write_in [c], copy = True ).tolist () # needs pandas +0.24.0
        write_in [c] = write_in [c].values.tolist ()

# saving as *.csv Comma Separated Values
if 'csv' in filetype or 'CSV' in filetype :         # If chosen, take care when loading this file. Remember the data structure…!
    if onlydata :
        with open ( path, mode ) as f :                         # alternative 0; pd.DataFrame
            write_in.to_csv ( f, header = None, index = False ) # alternative 0; pd.DataFrame
       #write_in.to_csv ( path, header = None, index = False )   # alternative 1; pd.DataFrame
    else :                                          ### works.
# converting the chosen variables to linewise objects
        for c in write_in :
            if type ( write_in [c] ) == np.ndarray :
                write_in [c] = write_in [c].tolist ()
            elif type ( write_in [c] ) == pd.DataFrame :
                #write_in [c] = write_in [c].to_numpy (copy = True).tolist ()            # alternative 2
                #write_in [c] = DataFrame.to_numpy (write_in [c], copy = True).tolist () # alternative 1
                write_in [c] = write_in [c].values.tolist ()                             # alternative 0
# the saving itself
        with open ( path, 'w', newline = '' ) as f :
            w = csv.writer ( f, delimiter = ',', quotechar = '"' )#, quoting = csv.QUOTE_ALL ) # alternative 1; dict
            for key, value in write_in.items () :                                                # alternative 1; dict
                w.writerow ( [key, value] )                                                      # alternative 1; dict
# saving as *.json JavaScript Object Notation; the dict should be written into the file as onto the command line.
elif 'json' in filetype or 'JSON' in filetype :
    if onlydata :
        with open ( path, mode ) as f :
            write_in.to_json (f)#, orient = 'columns' )#, orient = DataFrame )#, index = False ) # alternative 0; pd.DataFrame
    else :                                          ### works.
# converting the chosen variables to JSON serializable objects
        for c in write_in :
            if type ( write_in [c] ) == np.ndarray :
                write_in [c] = write_in [c].tolist ()
            elif type ( write_in [c] ) == pd.DataFrame :
                write_in [c] = write_in [c].to_json ( orient = 'columns' )
# the saving itself
        with open ( path, mode ) as f :
            #f.write ( json.dumps (write_in) )      # alternative 1; dict
            json.dump ( write_in, f, indent = 4 )   # alternative 0; dict
# saving as *.txt
elif 'txt' in filetype or 'TXT' in filetype :
    if onlydata :
            with open ( path, mode ) as f :
                f.write ( str (write_in) )          ### Better make this with pd.iterrows
    else :
        with open ( path, mode ) as f :
            f.write ( str (write_in) )
# saving as *.dat; this shouldn't make a difference for the file here as if saved as *.txt.
elif 'dat' in filetype or 'DAT' in filetype :
    if onlydata :
        with open ( path, mode = mode + 'b' ) as f :
            f.write (write_in)                      ### Better make this with pd.iterrows…?
    else :
        with open ( path, mode = mode + 'b' ) as f :
            f.write (write_in)
else :
    print ( 'save_file: Unknown file format. Aborting program part.' )

读取文件

# from *.csv
if 'csv' in filetype or 'CSV' in filetype :
    read_out = {}
    with open ( path, 'r' ) as f :
        reader = csv.reader (f)
        for k, v in reader :
            read_out [k] = v
        #for line in f :                             # if the above fails
           #(key, val) = line.split (',')            # if the above fails
           #read_out [key] = val                     # if the above fails
    #d = {}                                          # if the above fails
    #for key, val in read_out.iterrows () :          # if the above fails
        #d [key] = val                               # if the above fails
# converting the str the their original dtype, determined by what's in the str; better have put it in the file…?
    for a in read_out :
        if read_out [a] == '' :
            if 'dir' in a :                         # Relative paths; set *indir*, *outdir* and *workdir* to *rootpath* if not specified.
                read_out [a] = '.'
            elif a == 'result' :                    # No result was achieved.
                read_out [a] = None
        elif a == 'dataid' :
            if '[' == read_out [a] [0] and ']' == read_out [a] [-1] and "', '" in read_out [a] :
                read_out [a] = read_out [a].split ("', '") [ 1 : -1 ] # Take away the enclosing brackets -> split up the long str to multiple short ones by the separating sequence of a str (list)
            elif "', '" in read_out [a] :
                read_out [a] = read_out [a].split ("', '")
            elif '; ' in read_out [a] :
                read_out [a] = read_out [a].split ('; ')
            elif ';' in read_out [a] :
                read_out [a] = read_out [a].split (';')
            else :
                seppi = input ( "read_configs: Couldn't determine the separating character of *dataid*. Please type it (Standard: comma): " )
                if seppi == '' :
                    seppi = ','
                    if seppi in read_out [a] :
                        read_out [a] = read_out [a].split (seppi)
                    else :
                        read_out [a] = ['g_g', 'r_g', 'r_r']
        elif a == 'datafiles' :                     # input list of data files which weren't taken into computation yet; absolute paths
            read_out [a] = read_out [a].split ("'") [ 1 : -1 : 2 ]
        elif '[' == read_out [a] [0] and ']' == read_out [a] [-1] : # Should be a np.ndarray because other entries with '[' and ']' are already sorted out.
            if read_out [a].count ('[') == read_out [a].count (']') == 1 : # one-dimensional array
                #floats = np.array ( read_out [a] [ 1 : -1 ].split (', ') [:] ).astype (np.float128) # alternative 1
                #ints = np.array ( read_out [a] [ 1 : -1 ].split (', ') [:] ).astype (np.int64)      # alternative 1
                floats = np.fromstring ( read_out [a] [ 1 : -1 ], dtype = np.float128, sep = ',' )  # alternative 0
                ints = np.fromstring ( read_out [a] [ 1 : -1 ], dtype = np.int64, sep = ',' )       # alternative 0
                if ints.all () == floats.all () and not ints.all () == np.zeros (floats.size).all () :
                    read_out [a] = ints
                else :
                    read_out [a] = floats
            else :                                  # multi-dimensional array; actually works only with two-dimensional arrays.
                md_array = read_out [a] [ 2 : -2 ].split ('], [') # Removing '[[' from the beginning and ']]' from the end, splitting it up to a list where the elements are the inner np.ndarrays as str.
                f, i = [], []
                for b in range ( len (md_array) ) : # iterating over the length of *md_array*
                    #floats = np.array ( md_array [b].split (', ') ).astype (np.float128)    # alternative 1
                    #ints = np.array ( md_array [b].split (', ') ).astype (np.int64)         # alternative 1
                    floats = np.fromstring ( md_array [b], dtype = np.float128, sep = ',' ) # alternative 0
                    ints = np.fromstring ( md_array [b], dtype = np.int64, sep = ',' )      # alternative 0
                    f.append (floats)
                    i.append (ints)
                floats = np.array (f)
                ints = np.array (i)
                if ints.all () == floats.all () and not ints.all () == np.zeros (floats.size).all () :
                    read_out [a] = ints
                else :
                    read_out [a] = floats
# from *.json; the dict should be written into the file as onto the command line.
elif 'json' in filetype or 'JSON' in filetype :
    with open ( path, 'r' ) as f :                  # alternative 0
        read_out = json.load (f)                    # alternative 0
        #f.read ( json.dumps (read_out) )            # alternative 1
# converting the str the their original dtype, determined by what's in the str; better have put it in the file…?
    for a in read_out :
        if a == 'result' or a == 'datafiles' or a == 'dataid' : # input list of data files which weren't taken into computation yet; absolute paths
            pass
        elif read_out [a] == '' and 'dir' in a :    # Relative paths; set *indir*, *outdir* and *workdir* to *rootpath* if not specified.
            read_out [a] = '.'
        elif type ( read_out [a] ) == list :
            #read_out [a] = pd.read_json ( read_out [a], numpy = True, precise_float = True )
            read_out [a] = np.asarray ( read_out [a], dtype = np.float128 )
# from *.txt
elif 'txt' in filetype or 'TXT' in filetype :
    with open ( path, 'r' ) as f :
        reading = f.read ()
# from *.dat; this shouldn't make a difference for the file here as if saved as *.txt.
elif 'dat' in filetype or 'DAT' in filetype :
    with open ( path, 'rb' ) as f :
        f.read (read_out)
# Put the variables in the dicts *content* or *hmm*.
for a in read_out :
    if 'hmm_' in a :
        hmm [a] =  read_out [a]
    else :
        content [a] = read_out [a]
if 'data' in content :
    content ['data'] = pd.DataFrame ( content ['data'] )

标签: pythonjsoncsvdictionaryfile-io

解决方案


推荐阅读