首页 > 解决方案 > Best practices, time-wise, for Reading and Writing files in python 3.7

问题描述

I have a fairly large dataset (1440 × 18364, rows × columns) composed of x, y, and z coordinates that I am creating for a machine learning algorithm; each new source row is expanded into 360 rotated variations. Currently I am at 4 datasets, but the orientation of the data (rotation about the z axis) is in theory irrelevant for the output I am trying to capture, which results in 360x the data. This poses something of a memory issue, as I need to interpolate every row to get the rotational variation. Currently it takes about 2 minutes to process 4 datasets with only a 1-degree change, i.e. 8 rows. This leads me to believe that the total operation will take over 6 hours to process 180x that amount of data. What is the best way to write the data so that I can do this in the most efficient manner, instead of taking 6 hours? And if there are better ways to do this, I am all ears.

The end goal is to write this to a file so that I can just read the data in every time new data is created instead of processing it every time. I have read about saving the model as a file, but that would be down the line, after I get the dataset into a reasonable form.

If you have any questions, feel free to ask.

EDIT: Here is an MRE. You can change the size of the dataset with the POINTS variable. If you would like to change how many degree variations of the dataset are generated, change the linspace in the degrees variable inside mat2consistent to something larger than 2. One other thing I forgot to mention is that the coordinates are bound in a circle, hence the genRadialPoints function:

from keras.models import Sequential
from keras.layers.core import Dense
from keras.optimizers import Adam
import csv
import scipy.io as sio
import pandas as pd
import numpy as np
import scipy.interpolate
from scipy.spatial.transform import Rotation as R

POINTS = 5

def readCSV(folder, file):
  """
    Stand-in CSV reader for the example: both arguments are ignored and a
    dictionary holding a single 'Time' column is printed and returned
  """
  table = {'Time': [1, 2, 3, 4]}
  print(table)
  return table

def genRadialPoints(r):
  """
    Creates a set of points in a circular pattern where the center is (0,0), and the radius of the circle is specified
    in the input

    The points sit on 180 rays (angles evenly spaced from 0 to 359 degrees) at 35 radii evenly spaced from 0 to
    0.75*r. The first ray keeps all 35 radii, including the center; every other ray skips radius 0 so the center
    is only counted once.

    r: radius of the circle; the outermost samples lie at 0.75*r

    Returns [xcoord, ycoord], two 1-D arrays of length 35 + 179*34 = 6121, ordered ray by ray and, within a ray,
    by increasing radius. The shape of xcoord is printed as a side effect.
  """
  rotspace = np.linspace(0,359,num=180)*np.pi/180
  radspace = np.linspace(0,r*0.75,num=35)

  # Grab the first ray in full, so the center is not counted multiple times
  first_x = np.cos(rotspace[0])*radspace
  first_y = np.sin(rotspace[0])*radspace

  # Remaining rays: the outer product builds every (angle, radius) pair at once.
  # Growing the arrays with np.append in a nested loop would copy the whole
  # array on every call.
  rest_x = np.outer(np.cos(rotspace[1:]), radspace[1:]).ravel()
  rest_y = np.outer(np.sin(rotspace[1:]), radspace[1:]).ravel()

  xcoord = np.concatenate((first_x, rest_x))
  ycoord = np.concatenate((first_y, rest_y))

  print(xcoord.shape)
  return [xcoord, ycoord]

def mat2consistent(mat_file_location, do_360=False):
    """
    converts a .mat file to a consistent set of points to be used as an input for a neural net

    In this example the .mat loading is commented out, so mat_file_location is
    not read; a POINTS x POINTS grid with random z values stands in for the
    file contents. The sampled surface is rotated about the z axis and, for
    each rotation, z is linearly interpolated onto the fixed radial grid
    produced by genRadialPoints.

    mat_file_location: path of the .mat file (currently unused, see above)
    do_360: when True, one row is produced per angle in ``degrees``; when
        False, a single unrotated row is produced

    Returns a 2-D array with one row per rotation. Each row is the radial x
    coordinates, then the radial y coordinates, then the interpolated z
    values, concatenated end to end. With griddata's default fill value, z is
    NaN for radial points outside the convex hull of the rotated samples.
    """
    # mat_contents = sio.loadmat(mat_file_location)
    # x = mat_contents['xl']
    # y = mat_contents['yl']
    # zz = mat_contents['Zrem2']

    x = np.linspace(1,10,POINTS) - 5
    y = np.linspace(1,10,POINTS) - 5

    zz = np.random.random((POINTS,POINTS))

    xx, yy = np.meshgrid(x,y)

    radius = 5
    xcoord, ycoord = genRadialPoints(radius)

    # Flatten each grid to a column vector and keep every 5th sample.
    # NOTE(review): the stride is applied to the flattened grid, so when POINTS
    # is a multiple of 5 only grid columns whose index is a multiple of 5
    # survive (just column 0 for POINTS = 5) - confirm that is intended.
    columns = []
    for label, grid in (('xx', xx), ('yy', yy), ('zz', zz)):
        print(label + ' size', grid.shape)
        column = grid.reshape(-1, 1)[0::5, :]
        print(label + ' size new', column.shape)
        columns.append(column)
    xx, yy, zz = columns

    points = np.array([xx[:,0],yy[:,0]]).transpose()
    print('points shape:',points.shape)

    if do_360:
        # Use np.linspace(0,359,360) here for the full set of 1-degree steps.
        degrees = np.linspace(0,1,2)
    else:
        # No augmentation requested: interpolate the unrotated surface once.
        degrees = np.zeros(1)

    # Single-axis rotation about z. With a multi-axis sequence such as 'zyx'
    # the angles are matched to the axes in order, so the z angle would have
    # to come first; the single-axis form avoids that pitfall.
    rotations = R.from_euler('z', degrees, degrees=True).as_matrix()

    arrs = np.concatenate((xx, yy, zz),axis=1)
    threeSixty = []
    for rot in rotations:
        rotated = np.matmul(rot,arrs.T).T
        # Drop samples whose rotated x is NaN before triangulating.
        notnanind = np.logical_not(np.isnan(rotated[:,0]))
        zcoord = scipy.interpolate.griddata(rotated[notnanind,0:2],rotated[notnanind,2],(xcoord,ycoord), method='linear')
        threeSixty.append(np.concatenate((xcoord, ycoord, zcoord),axis=0))

    return np.array(threeSixty)

def csv2time(folder, file):
  """
    Returns the span of the 'Time' column of the given CSV: the last entry
    minus the first entry of what readCSV returns for (folder, file)
  """
  stamps = readCSV(folder, file)['Time']
  return stamps[-1] - stamps[0]

def prep_data(matfile_names, csvfile_names, data_folder=None):
    """
    Builds the network inputs and labels from lists of .mat and .csv file names.

    matfile_names: names of the .mat files; every row that mat2consistent
        returns for a file (called with do_360=True) becomes one input row
    csvfile_names: names of the .csv files; each contributes one label (the
        time span from csv2time), repeated for an equal share of the input
        rows (total rows floor-divided by the number of csv files)
    data_folder: prefix prepended to each .mat file name and passed to
        csv2time. When omitted, the module-level ``folder`` variable is used,
        so existing two-argument calls keep working.

    Returns [inputs, labels] as numpy arrays. Progress and the final shapes
    are printed along the way.
    """
    if data_folder is None:
        # Legacy behaviour: rely on the global set by the script entry point.
        data_folder = folder

    outer_layer = []
    for matfile in matfile_names:
        print('file name is:',matfile)

        # Prep the data:
        data = mat2consistent(data_folder + matfile, do_360=True)
        for row in data:
            print('data size',row.shape)
            outer_layer.append(row)

    # Named `inputs` rather than `input` so the builtin is not shadowed.
    inputs = np.array(outer_layer)

    labels = []
    if csvfile_names:
        # Each csv file labels an equal slice of the input rows.
        rows_per_label = inputs.shape[0] // len(csvfile_names)
        for csvfile in csvfile_names:
            print('Reading in file:', csvfile)
            # Prep the labels:
            timetemp = csv2time(data_folder, csvfile)
            labels.extend([timetemp] * rows_per_label)

    labels = np.array(labels)

    print('input shape', inputs.shape)
    print('label shape', labels.shape)
    return [inputs, labels]

# def

if __name__ == '__main__':

  # prep_data reads this module-level path when building file locations
  folder = 'C:\\Users\\nquattrociocchi\\Documents\\Pool\\'
  matfiles = ['one', 'two', 'three', 'four']
  csvfiles = ['one', '2', '3', 'four']

  features, targets = prep_data(matfiles, csvfiles)

  # Fully connected stack: 400 -> 200 -> 75 -> 28 -> 10 relu units, then a
  # single sigmoid output
  n_inputs = int(features.shape[1])
  relu_widths = (200, 75, 28, 10)
  layers = [Dense(400, input_shape=(n_inputs,), activation='relu')]
  layers.extend(Dense(width, activation='relu') for width in relu_widths)
  layers.append(Dense(1, activation='sigmoid'))
  model = Sequential(layers)

  model.summary()
  # NOTE(review): the labels built by prep_data are time spans from csv2time,
  # while the output layer is a sigmoid trained with binary cross-entropy -
  # confirm this pairing is intended.
  model.compile(Adam(lr=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

  model.fit(x=features, y=targets, batch_size=5, epochs=10, shuffle=True, verbose=2)

标签: python machine-learning

解决方案


推荐阅读