Memory error while processing my dataset in Python? What could be the reason?

Problem description

I am trying to run deep-learning code on a dataset of 112,120 images. Here is what my code does:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import imageio
from os import listdir
import skimage.transform
import pickle
import sys, os
from sklearn.preprocessing import MultiLabelBinarizer

def get_labels(pic_id):
    labels = meta_data.loc[meta_data["Image Index"]==pic_id,"Finding Labels"]
    return labels.tolist()[0].split("|")
#Loading Data
meta_data = pd.read_csv(data_entry_path)
bbox_list = pd.read_csv(bbox_list_path)
with open(train_txt_path, "r") as f:
    train_list = [i.strip() for i in f.readlines()]
with open(valid_txt_path, "r") as f:
    valid_list = [i.strip() for i in f.readlines()]
label_eight = list(np.unique(bbox_list["Finding Label"])) + ["No Finding"]

# transform training images
print("training example:",len(train_list))
print("take care of your RAM here !!!")
train_X = []
for i in range(len(train_list)):
    image_path = os.path.join(image_folder_path,train_list[i])
    img = imageio.imread(image_path)
    if img.shape != (1024,1024): # some images in the training set have shape (1024,1024,4)
        img = img[:,:,0]
    img_resized = skimage.transform.resize(img,(256,256)) # or use img[::4] here
    train_X.append((np.array(img_resized)/255).reshape(256,256,1))
    if i % 3000==0:
        print(i)
train_X = np.array(train_X)
np.save(os.path.join(data_path,"train_X_small.npy"), train_X)

# transform validation images
print("validation example:",len(valid_list))
valid_X = []
for i in range(len(valid_list)):
    image_path = os.path.join(image_folder_path,valid_list[i])
    img = imageio.imread(image_path)
    if img.shape != (1024,1024):
        img = img[:,:,0]
    img_resized = skimage.transform.resize(img,(256,256))
    valid_X.append((np.array(img_resized)/255).reshape(256,256,1))
    if i % 3000==0:
        print(i)

valid_X = np.array(valid_X)
np.save(os.path.join(data_path,"valid_X_small.npy"), valid_X)


# process label
print("label preprocessing")

train_y = []
for train_id in train_list:
    train_y.append(get_labels(train_id))
valid_y = []
for valid_id in valid_list:
    valid_y.append(get_labels(valid_id))


encoder = MultiLabelBinarizer()
encoder.fit(train_y+valid_y)
train_y_onehot = encoder.transform(train_y)
valid_y_onehot = encoder.transform(valid_y)
train_y_onehot = np.delete(train_y_onehot, [2,3,5,6,7,10,12],1) # keep only the 8 bbox finding labels; drop the other columns, including "No Finding"
valid_y_onehot = np.delete(valid_y_onehot, [2,3,5,6,7,10,12],1) # keep only the 8 bbox finding labels; drop the other columns, including "No Finding"

with open(data_path + "/train_y_onehot.pkl","wb") as f:
    pickle.dump(train_y_onehot, f)
with open(data_path + "/valid_y_onehot.pkl","wb") as f:
    pickle.dump(valid_y_onehot, f)
with open(data_path + "/label_encoder.pkl","wb") as f:
    pickle.dump(encoder, f)

So that is my code. My system configuration: Intel i7-7700HQ, 16 GB RAM, 256 GB SSD, GTX 1050 4 GB.
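For scale, a rough back-of-envelope estimate of what stacking all of the resized images into one array would take (a sketch; it assumes the float64 output that skimage.transform.resize produces by default):

n_images = 112120                      # full image count from the question
bytes_per_value = 8                    # float64 from skimage.transform.resize and the /255 division
total_bytes = n_images * 256 * 256 * 1 * bytes_per_value
print(total_bytes / 1024**3)           # roughly 55 GiB, far beyond 16 GB of RAM

On top of that, np.array(train_X) has to build this array while the Python list still holds every individual 256x256x1 array, so the peak requirement is roughly twice the figure above.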

Is there a way to split my dataset and still write the result back into the same file? I have also posted, as a screenshot, the error I got after the code had run for about 30 minutes (Error From Powershell).
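One way to end up with a single .npy file without ever holding the whole array in RAM is to pre-allocate it on disk as a memory map and fill it image by image. A minimal sketch of that idea, reusing the paths and file lists from the code above (the float16 dtype is my assumption to halve the file size, not something from the original code):

import os
import imageio
import numpy as np
import skimage.transform
from numpy.lib.format import open_memmap

# pre-allocate the target .npy on disk; only one image at a time lives in RAM
train_X = open_memmap(os.path.join(data_path, "train_X_small.npy"),
                      mode="w+", dtype=np.float16,
                      shape=(len(train_list), 256, 256, 1))
for i, name in enumerate(train_list):
    img = imageio.imread(os.path.join(image_folder_path, name))
    if img.ndim == 3:                          # (1024,1024,4) images: keep the first channel
        img = img[:, :, 0]
    img = skimage.transform.resize(img, (256, 256))
    train_X[i] = (img / 255).reshape(256, 256, 1)   # cast to float16 on assignment
    if i % 3000 == 0:
        print(i)
train_X.flush()                                # make sure everything is written to disk

The same pattern works for the validation images, and np.load(..., mmap_mode="r") can later read the file back without loading it all at once.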

I am also using 64-bit Python 3 on my system.

Would splitting the 112,120 images and processing them as batches work here? If so, how?
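If separate files per batch are acceptable, chunked processing is another option. A sketch under the same assumptions as above (chunk_size and the file-name pattern are made up for illustration, and the imports and lists from the question's code are reused):

chunk_size = 10000                              # arbitrary; pick whatever fits in memory
for start in range(0, len(train_list), chunk_size):
    names = train_list[start:start + chunk_size]
    chunk = np.empty((len(names), 256, 256, 1), dtype=np.float16)
    for j, name in enumerate(names):
        img = imageio.imread(os.path.join(image_folder_path, name))
        if img.ndim == 3:
            img = img[:, :, 0]
        chunk[j] = (skimage.transform.resize(img, (256, 256)) / 255).reshape(256, 256, 1)
    np.save(os.path.join(data_path, "train_X_small_%d.npy" % (start // chunk_size)), chunk)
    del chunk                                   # release the batch before building the next one

During training you would then load one chunk at a time (or wrap the loading in a generator), so the full 112,120 images never sit in memory together.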

Tags: python, python-3.x, numpy, deep-learning, sklearn-pandas

Solution

