python - Best practices, time-wise, for Reading and Writing files in python 3.7
问题描述
I have a fairly large dataset (1440 × 18364, rows × columns) composed of x, y, and z coordinates that I am creating for a machine learning algorithm, with each newly created row being expanded into 360 variations. Currently I am at 4 datasets, but the orientation of the data (rotation about the z axis) is, in theory, irrelevant to the output I am trying to capture, which results in 360x the data. This poses something of a memory issue, as I need to interpolate every row to get the rotational variation. Currently it takes about 2 minutes to process 4 datasets with only a 1-degree change, i.e. 8 rows. This leads me to believe that the total operation will take over 6 hours to process 180x that amount of data. What is the best way to write the data so that I can do this in the most efficient manner, instead of taking 6 hours? And if there are better ways to do this, I am all ears.
The end goal is to write this to a file so that I can just read the data in every time new data is created instead of processing it every time. I have read about saving the model as a file, but that would be down the line, after I get the dataset into a reasonable form.
If you have any questions, feel free to ask.
EDIT: Here is an MRE. You can change the size of the dataset with the POINTS variable. If you would like to change how many degree variations of the dataset are generated, change the linspace in the degrees variable inside mat2consistent to something larger than 2. One other thing I forgot to mention is that the coordinates are bounded by a circle, hence the genRadialPoints function:
from keras.models import Sequential
from keras.layers.core import Dense
from keras.optimizers import Adam
import csv
import scipy.io as sio
import pandas as pd
import numpy as np
import scipy.interpolate
from scipy.spatial.transform import Rotation as R
# Number of samples per axis of the synthetic x/y grid (and the side length of
# the random z grid) built in mat2consistent; raise it to enlarge the example.
POINTS = 5
def readCSV(folder, file):
    """
    Return the contents of a headered 2D CSV as a dictionary.

    In this example the function is a placeholder: ``folder`` and ``file``
    are accepted but never read, and a fixed 'Time' column is returned after
    being echoed to stdout.
    """
    table = {'Time': [1, 2, 3, 4]}
    print(table)
    return table
def genRadialPoints(r):
    """
    Build the x/y coordinates of a radial sampling pattern centred on (0, 0).

    The pattern uses 180 ray angles evenly spaced over [0, 359] degrees and
    35 radii evenly spaced over [0, 0.75 * r].  The first ray keeps all 35
    radii (including the centre); every later ray skips radius 0 so that the
    centre point is not counted multiple times.

    Parameters
    ----------
    r : float
        Radius of the bounding circle; samples extend out to 0.75 * r.

    Returns
    -------
    list of numpy.ndarray
        ``[xcoord, ycoord]``: two 1-D arrays of equal length
        (35 + 179 * 34 points), ordered ray by ray and, within each ray, by
        increasing radius.
    """
    rotspace = np.linspace(0, 359, num=180) * np.pi / 180
    radspace = np.linspace(0, r * 0.75, num=35)

    later_angles = rotspace[1:]
    outer_radii = radspace[1:]

    # One outer product per axis replaces the element-by-element np.append
    # calls, each of which re-allocated and copied the whole array.  Row-major
    # ravel() keeps the original "for each angle, for each radius" ordering.
    xcoord = np.concatenate((
        np.cos(rotspace[0]) * radspace,
        np.outer(np.cos(later_angles), outer_radii).ravel(),
    ))
    ycoord = np.concatenate((
        np.sin(rotspace[0]) * radspace,
        np.outer(np.sin(later_angles), outer_radii).ravel(),
    ))
    print(xcoord.shape)
    return [xcoord, ycoord]
def mat2consistent(mat_file_location, do_360=False):
    """
    Resample a height map onto the fixed radial point set used as neural-net input.

    The (x, y, z) samples are rotated about the z axis by each angle in
    ``degrees`` and, for every rotation, z is linearly interpolated at the
    points produced by ``genRadialPoints``.

    Parameters
    ----------
    mat_file_location : str
        Path of the .mat file.  Currently unused: the loader is disabled and
        a synthetic POINTS x POINTS grid with random z values stands in.
    do_360 : bool, optional
        When True, one row is produced per angle in ``degrees``.  When False,
        only the unrotated (0 degree) row is produced.

    Returns
    -------
    numpy.ndarray
        Array with one row per rotation.  Each row is the concatenation
        ``[xcoord..., ycoord..., zcoord...]``.  Radial points that fall
        outside the convex hull of the rotated samples get NaN for z
        (griddata's default fill value).
    """
    # The real loader is disabled in this example:
    # mat_contents = sio.loadmat(mat_file_location)
    # x = mat_contents['xl']
    # y = mat_contents['yl']
    # zz = mat_contents['Zrem2']
    x = np.linspace(1, 10, POINTS) - 5
    y = np.linspace(1, 10, POINTS) - 5
    zz = np.random.random((POINTS, POINTS))
    xx, yy = np.meshgrid(x, y)

    radius = 5
    xcoord, ycoord = genRadialPoints(radius)

    # Flatten each grid to a column vector and keep every 5th sample.
    # NOTE(review): with POINTS = 5 this stride appears to select a single
    # grid column (all x equal), which cannot be triangulated -- confirm the
    # stride against the real data dimensions.
    columns = []
    for label, grid in (('xx', xx), ('yy', yy), ('zz', zz)):
        print(label + ' size', grid.shape)
        column = grid.reshape(-1, 1)[0::5, :]
        print(label + ' size new', column.shape)
        columns.append(column)
    arrs = np.concatenate(columns, axis=1)
    print('points shape:', arrs[:, 0:2].shape)

    if do_360:
        # degrees = np.linspace(0, 359, 360)  # full sweep
        degrees = np.linspace(0, 1, 2)
    else:
        # Without rotation augmentation, still return the unrotated sample
        # instead of failing on an undefined result.
        degrees = np.zeros(1)

    # Rotate about z only.  The previous ('zyx', [0, 0, d]) call applied d to
    # the x axis, which mixed y into z.  Passing the whole angle array builds
    # every rotation matrix in a single call.
    rotations = R.from_euler('z', degrees, degrees=True).as_matrix()

    threeSixty = []
    for rot in rotations:
        rotated = np.matmul(rot, arrs.T).T
        # Drop samples whose rotated x coordinate is NaN before triangulating.
        keep = np.logical_not(np.isnan(rotated[:, 0]))
        zcoord = scipy.interpolate.griddata(
            rotated[keep, 0:2],
            rotated[keep, 2],
            (xcoord, ycoord),
            method='linear',
        )
        threeSixty.append(np.concatenate((xcoord, ycoord, zcoord), axis=0))
    return np.array(threeSixty)
def csv2time(folder, file):
    """
    Return the span of the 'Time' column of a CSV: last entry minus first.

    ``folder`` and ``file`` are forwarded unchanged to ``readCSV``.
    """
    stamps = readCSV(folder, file)['Time']
    return stamps[-1] - stamps[0]
def prep_data(matfile_names, csvfile_names, folder=None):
    """
    Assemble the network inputs and their labels.

    Every .mat file is expanded by ``mat2consistent(..., do_360=True)`` and
    all resulting rows are stacked into one input array.  Every CSV file
    yields one label via ``csv2time``; that label is repeated so the labels
    line up with the input rows.

    Parameters
    ----------
    matfile_names : sequence of str
        File names appended to ``folder`` and passed to ``mat2consistent``.
    csvfile_names : sequence of str
        File names passed, together with ``folder``, to ``csv2time``.
    folder : str, optional
        Directory prefix for both kinds of file.  When omitted, the
        module-level ``folder`` set by the script entry point is used, which
        is what this function silently relied on before.

    Returns
    -------
    list
        ``[input, labels]`` as two numpy arrays.

    Raises
    ------
    ValueError
        If ``folder`` is not given and no module-level ``folder`` exists.

    Notes
    -----
    Labels are spread evenly: each CSV label is repeated
    ``rows // len(csvfile_names)`` times, so this assumes every .mat file
    contributes the same number of rows and that the files are listed in
    matching order.
    """
    if folder is None:
        folder = globals().get('folder')
        if folder is None:
            raise ValueError(
                'no data folder given: pass folder=... or define a '
                'module-level `folder`'
            )

    samples = []
    for file in matfile_names:
        print('file name is:', file)
        # Prep the data:
        data = mat2consistent(folder + file, do_360=True)
        for row in data:
            print('data size', row.shape)
            samples.append(row)
    features = np.array(samples)

    # Guarded so that an empty CSV list still yields empty labels instead of
    # dividing by zero.
    rows_per_label = features.shape[0] // len(csvfile_names) if csvfile_names else 0

    labels = []
    for csvfile in csvfile_names:
        print('Reading in file:', csvfile)
        # Prep the labels:
        labels.extend([csv2time(folder, csvfile)] * rows_per_label)
    labels = np.array(labels)

    print('input shape', features.shape)
    print('label shape', labels.shape)
    return [features, labels]
# def
if __name__ == '__main__':
    # prep_data reads this module-level name, so it must be set before the call.
    folder = 'C:\\Users\\nquattrociocchi\\Documents\\Pool\\'
    matfiles = ['one', 'two', 'three', 'four']
    csvfiles = ['one', '2', '3', 'four']

    features, labels = prep_data(matfiles, csvfiles)

    # Fully connected stack: ReLU hidden layers of decreasing width followed
    # by a single sigmoid output unit.
    n_inputs = int(features.shape[1])
    layers = [Dense(400, input_shape=(n_inputs,), activation='relu')]
    layers.extend(Dense(width, activation='relu') for width in (200, 75, 28, 10))
    layers.append(Dense(1, activation='sigmoid'))
    model = Sequential(layers)
    model.summary()

    # NOTE(review): the labels come from csv2time (a time difference), while
    # the output is sigmoid with binary_crossentropy -- confirm this pairing
    # is intended.
    optimizer = Adam(lr=0.0001)
    model.compile(optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(x=features, y=labels, batch_size=5, epochs=10, shuffle=True, verbose=2)
解决方案
推荐阅读
- c# - 在 WinDBG/SOS.DLL 中:如何一次将 !DumpDomain 列出的所有模块/程序集保存到磁盘?
- javascript - 如何添加两个javascript对象
- python - 使用集合论在 Python 中查找形状区域
- unity3d - 为什么场景摄像机在播放时会改变位置和旋转
- html - 在 Shiny 中使用 renderDataTable 扩展列的宽度
- reactjs - React / Redux为什么不是按调用顺序触发的thunk
- python - 正则表达式 - 计算最大数量的短串联重复
- tensorflow - 解码器不接受双向编码器的输出
- r - 使用 purrr map 跨多个子集执行统计测试
- r - 在 r 中写出 .dat 文件