How do I make TensorFlow use more RAM?

Problem description

I'm trying to build a GAN with tensorflow/keras. I have 32 GB of RAM, and I even added another 70 GB of virtual memory on top of that. When I try to use roughly 40,000 images of shape (128, 128, 3), TensorFlow stops and I get:

2021-01-22 00:12:42.680822: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 9232121856 exceeds 10% of free system memory.

However, the most RAM it ever used was 25 GB of my theoretical 100 GB. Even if TensorFlow can't use the virtual memory, that is still only about 80% of my physical RAM. Changing the batch size and the number of trainable parameters did nothing. Is the warning talking about VRAM? Am I doing something wrong? How do I make TensorFlow use more RAM?
I'm currently using:
python 3.8
tf-gpu 2.4.0rc1
keras 2.4.3

Edit: I tried lowering the image resolution to (64, 64, 3), which let me use the entire dataset of 106,000 images. This time it trained, but it still printed the 10%-of-memory warning, and it only used about 70% of my physical RAM.
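As a rough sanity check, assuming the images end up in one float32 NumPy array, the number in the warning lines up exactly with the dataset size (the output below shows 46957 images actually loaded), which suggests the warning is about system RAM rather than VRAM:

print(46957 * 128 * 128 * 3 * 4)   # 9232121856 bytes (~8.6 GiB), the exact size in the warning
print(106000 * 64 * 64 * 3 * 4)    # 5210112000 bytes (~4.9 GiB) for the (64, 64, 3) run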

Full code:

import imageio, os, PIL, random, pickle
from glob import glob
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import time
import cv2
from IPython import display

config = tf.compat.v1.ConfigProto(gpu_options = tf.compat.v1.GPUOptions(
        per_process_gpu_memory_fraction=0.725))
for device in tf.config.experimental.list_physical_devices("GPU"):
    tf.config.experimental.set_memory_growth(device, True)
session = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(session)
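# The config above caps TensorFlow at ~72.5% of GPU memory, and
# set_memory_growth makes it allocate VRAM on demand instead of
# grabbing it all at startup.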

IMG_SIZE = 128
BATCH_SIZE = 64
EPOCHS = 500000
noise_dim = 100
num_examples_to_generate = 96

paths = glob('F:/DATA/E621_GAN/**', recursive=True)
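# drop the first match: with '**' and recursive=True, glob returns the
# root folder itself before its contents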
del paths[0]

print(int((len(paths)) / BATCH_SIZE), "STEPS")


def make_generator_model():
    model = tf.keras.Sequential()
    model.add(layers.Dense(int(IMG_SIZE/8)*int(IMG_SIZE/8)*256, use_bias=False, input_shape=(noise_dim,)))
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())

    model.add(layers.Reshape((int(IMG_SIZE/8), int(IMG_SIZE/8), 256)))
    assert model.output_shape == (None, int(IMG_SIZE/8), int(IMG_SIZE/8), 256) # Note: None is the batch size

    model.add(layers.Conv2DTranspose(256, (7, 7), strides=(1, 1), padding='same', use_bias=False))
    assert model.output_shape == (None, int(IMG_SIZE/8), int(IMG_SIZE/8), 256)
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())

    model.add(layers.Conv2DTranspose(128, (7, 7), strides=(2, 2), padding='same', use_bias=False))
    assert model.output_shape == (None, int(IMG_SIZE/4), int(IMG_SIZE/4), 128)
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())

    model.add(layers.Conv2DTranspose(64, (7, 7), strides=(2, 2), padding='same', use_bias=False))
    assert model.output_shape == (None, int(IMG_SIZE/2), int(IMG_SIZE/2), 64)
    model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())

    model.add(layers.Conv2DTranspose(3, (5, 5), strides=(2, 2), padding='same', use_bias=False, activation='tanh'))
    assert model.output_shape == (None, IMG_SIZE, IMG_SIZE, 3)

    model.summary()
    return model

generator = make_generator_model()

noise = tf.random.normal([1, noise_dim])
generated_image = generator(noise, training=False)

def make_discriminator_model():
    model = tf.keras.Sequential()
    model.add(layers.Conv2D(32, (7, 7), strides=(2, 2), padding='same', input_shape=(IMG_SIZE, IMG_SIZE, 3)))
    model.add(layers.LeakyReLU())
    model.add(layers.Dropout(0.25))

    model.add(layers.Conv2D(64, (5, 5), strides=(2, 2), padding='same'))
    model.add(layers.LeakyReLU())
    model.add(layers.Dropout(0.25))

    model.add(layers.Conv2D(128, (5, 5), strides=(2, 2), padding='same'))
    model.add(layers.LeakyReLU())
    model.add(layers.Dropout(0.25))

    model.add(layers.Flatten())
    model.add(layers.Dense(1))

    model.summary()
    return model

discriminator = make_discriminator_model()
decision = discriminator(generated_image)

# This method returns a helper function to compute cross entropy loss
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

def discriminator_loss(real_output, fake_output):
    real_loss = cross_entropy(tf.ones_like(real_output), real_output)
    fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)
    total_loss = real_loss + fake_loss
    return total_loss


def generator_loss(fake_output):
    return cross_entropy(tf.ones_like(fake_output), fake_output)

generator_optimizer = tf.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)

checkpoint_dir = 'D:/DATA/E621_SAVES/checkpoints/'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,
                                 discriminator_optimizer=discriminator_optimizer,
                                 generator=generator,
                                 discriminator=discriminator)

checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

seed = tf.random.normal([num_examples_to_generate, noise_dim])

@tf.function
def train_step(images):
    noise = tf.random.normal([BATCH_SIZE, noise_dim])

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_images = generator(noise, training=True)

        real_output = discriminator(images, training=True)
        fake_output = discriminator(generated_images, training=True)

        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
    return gen_loss, disc_loss

print("Loading Dataset")
images = []
# ~1 GB of compressed images on disk becomes roughly 50 GB once decoded to float32
for i in range(len(paths)-60000):
    try:
        image_path = paths[i]
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = np.float32(image)
        images.append((image - 127.5) / 127.5)
    except Exception:
        # skip files that fail to read or decode (non-images, corrupt files)
        pass
print("Finished Loading Dataset")

images = np.array(images)
print(images.shape)
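# Note: from_tensor_slices() keeps a full copy of the array as a constant
# tensor, so peak usage is roughly double the array itself; the oversized
# shuffle buffer can then materialize every element yet again.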
train_dataset = tf.data.Dataset.from_tensor_slices(images).shuffle(99999999).batch(BATCH_SIZE)
print("Finished Organizing Dataset")

epoch_save = 0
try:
    with open('D:/DATA/E621_SAVES/checkpoints/epoch.pkl', 'rb') as save:
        epoch_save, seed = pickle.load(save)
except (FileNotFoundError, EOFError):
    # no saved training state yet; start from scratch
    print("No pickle found")

def train(dataset, epochs, epoch_save, seed):
    for epoch in range(epochs - epoch_save):
        start = time.time()
        epoch_save += 1

        gen_loss_list, disc_loss_list = [], []
        for image_batch in dataset:
            t = train_step(image_batch)
            gen_loss_list.append(t[0])
            disc_loss_list.append(t[1])

        g_loss = sum(gen_loss_list) / len(gen_loss_list)
        d_loss = sum(disc_loss_list) / len(disc_loss_list)

        display.clear_output(wait=True)
        generate_and_save_images(generator,
                             epoch_save,
                             seed)

        if (epoch + 1) % 25 == 0:
            checkpoint.save(file_prefix = checkpoint_prefix)
            data = [epoch_save, seed]
            with open('D:/DATA/E621_SAVES/checkpoints/epoch.pkl', 'wb') as save:
                pickle.dump(data, save)

        print(f'Epoch {epoch_save}: gen_loss={g_loss}, disc_loss={d_loss}, time: {np.round(time.time()-start, 4)} sec')

fig = plt.figure(figsize=(15.36,10.24))
def generate_and_save_images(model, epoch, test_input):
    predictions = model(test_input, training=False)

    ax = []
    for i in range(predictions.shape[0]):
        ax.append(plt.subplot(8, 12, i + 1))
        # rescale from [-1, 1] back to [0, 255] and show all three channels
        plt.imshow((predictions[i] * 127.5 + 127.5).numpy().astype("uint8"))

    for a in ax:
        a.set_xticks([])
        a.set_yticks([])

    fig.subplots_adjust(top = 1, bottom = 0, right = 1, left = 0, wspace=0, hspace=0)

    plt.savefig('D:/DATA/E621_SAVES/epoch_{:04d}.png'.format(epoch), dpi=100)
    plt.clf()

train(train_dataset, EPOCHS, epoch_save, seed)

Output:

2021-01-22 00:12:03.372768: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll
2021-01-22 00:12:05.743931: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-01-22 00:12:05.744894: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library nvcuda.dll
2021-01-22 00:12:05.765514: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 3070 computeCapability: 8.6
coreClock: 1.725GHz coreCount: 46 deviceMemorySize: 8.00GiB deviceMemoryBandwidth: 417.29GiB/s
2021-01-22 00:12:05.765726: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll
2021-01-22 00:12:05.775582: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublas64_11.dll
2021-01-22 00:12:05.775689: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublasLt64_11.dll
2021-01-22 00:12:05.778527: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cufft64_10.dll
2021-01-22 00:12:05.779685: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library curand64_10.dll
2021-01-22 00:12:05.785534: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusolver64_10.dll
2021-01-22 00:12:05.787678: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusparse64_11.dll
2021-01-22 00:12:05.788518: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudnn64_8.dll
2021-01-22 00:12:05.788665: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1862] Adding visible gpu devices: 0
2021-01-22 00:12:05.789173: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-01-22 00:12:05.790050: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 3070 computeCapability: 8.6
coreClock: 1.725GHz coreCount: 46 deviceMemorySize: 8.00GiB deviceMemoryBandwidth: 417.29GiB/s
2021-01-22 00:12:05.790231: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll
2021-01-22 00:12:05.790305: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublas64_11.dll
2021-01-22 00:12:05.790383: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublasLt64_11.dll
2021-01-22 00:12:05.790582: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cufft64_10.dll
2021-01-22 00:12:05.790704: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library curand64_10.dll
2021-01-22 00:12:05.790816: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusolver64_10.dll
2021-01-22 00:12:05.790887: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusparse64_11.dll
2021-01-22 00:12:05.790954: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudnn64_8.dll
2021-01-22 00:12:05.791139: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1862] Adding visible gpu devices: 0
2021-01-22 00:12:06.170269: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1261] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-01-22 00:12:06.170365: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1267]      0 
2021-01-22 00:12:06.170489: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1280] 0:   N 
2021-01-22 00:12:06.170738: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1406] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 5939 MB memory) -> physical GPU (device: 0, name: GeForce RTX 3070, pci bus id: 0000:01:00.0, compute capability: 8.6)
2021-01-22 00:12:06.171371: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
WARNING:tensorflow:From f:\PYTHON\GAN\GAN.py:16: The name tf.keras.backend.set_session is deprecated. Please use tf.compat.v1.keras.backend.set_session instead.

1671 STEPS
2021-01-22 00:12:08.495267: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 3070 computeCapability: 8.6
coreClock: 1.725GHz coreCount: 46 deviceMemorySize: 8.00GiB deviceMemoryBandwidth: 417.29GiB/s
2021-01-22 00:12:08.495435: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudart64_110.dll
2021-01-22 00:12:08.495586: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublas64_11.dll
2021-01-22 00:12:08.495694: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublasLt64_11.dll
2021-01-22 00:12:08.495766: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cufft64_10.dll
2021-01-22 00:12:08.495835: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library curand64_10.dll
2021-01-22 00:12:08.495905: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusolver64_10.dll
2021-01-22 00:12:08.495982: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cusparse64_11.dll
2021-01-22 00:12:08.496054: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudnn64_8.dll
2021-01-22 00:12:08.496143: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1862] Adding visible gpu devices: 0
2021-01-22 00:12:08.496223: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1261] Device interconnect StreamExecutor with strength 1 edge matrix:
2021-01-22 00:12:08.496293: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1267]      0 
2021-01-22 00:12:08.496340: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1280] 0:   N 
2021-01-22 00:12:08.496454: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1406] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 5939 MB memory) -> physical GPU (device: 0, name: GeForce RTX 3070, pci bus id: 0000:01:00.0, compute capability: 8.6)
2021-01-22 00:12:08.496788: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense (Dense)                (None, 65536)             6553600   
_________________________________________________________________
batch_normalization (BatchNo (None, 65536)             262144    
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 65536)             0         
_________________________________________________________________
reshape (Reshape)            (None, 16, 16, 256)       0         
_________________________________________________________________
conv2d_transpose (Conv2DTran (None, 16, 16, 256)       3211264   
_________________________________________________________________
batch_normalization_1 (Batch (None, 16, 16, 256)       1024      
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 16, 16, 256)       0         
_________________________________________________________________
conv2d_transpose_1 (Conv2DTr (None, 32, 32, 128)       1605632   
_________________________________________________________________
batch_normalization_2 (Batch (None, 32, 32, 128)       512       
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 32, 32, 128)       0         
_________________________________________________________________
conv2d_transpose_2 (Conv2DTr (None, 64, 64, 64)        401408    
_________________________________________________________________
batch_normalization_3 (Batch (None, 64, 64, 64)        256       
_________________________________________________________________
leaky_re_lu_3 (LeakyReLU)    (None, 64, 64, 64)        0         
_________________________________________________________________
conv2d_transpose_3 (Conv2DTr (None, 128, 128, 3)       4800      
=================================================================
Total params: 12,040,640
Trainable params: 11,908,672
Non-trainable params: 131,968
_________________________________________________________________
2021-01-22 00:12:08.793121: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublas64_11.dll
2021-01-22 00:12:09.334054: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cublasLt64_11.dll
2021-01-22 00:12:09.335006: I tensorflow/stream_executor/cuda/cuda_blas.cc:1838] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2021-01-22 00:12:09.337482: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library cudnn64_8.dll
2021-01-22 00:12:10.319075: I tensorflow/core/platform/windows/subprocess.cc:308] SubProcess ended with return code: 0

2021-01-22 00:12:10.349989: I tensorflow/core/platform/windows/subprocess.cc:308] SubProcess ended with return code: 0

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d (Conv2D)              (None, 64, 64, 32)        4736      
_________________________________________________________________
leaky_re_lu_4 (LeakyReLU)    (None, 64, 64, 32)        0         
_________________________________________________________________
dropout (Dropout)            (None, 64, 64, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 32, 32, 64)        51264     
_________________________________________________________________
leaky_re_lu_5 (LeakyReLU)    (None, 32, 32, 64)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 32, 32, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 16, 16, 128)       204928    
_________________________________________________________________
leaky_re_lu_6 (LeakyReLU)    (None, 16, 16, 128)       0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 16, 16, 128)       0         
_________________________________________________________________
flatten (Flatten)            (None, 32768)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 32769     
=================================================================
Total params: 293,697
Trainable params: 293,697
Non-trainable params: 0
_________________________________________________________________
Loading Dataset
Finished Loading Dataset
(46957, 128, 128, 3)
Finished Organizing Dataset
No pickle found
2021-01-22 00:12:42.680822: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 9232121856 exceeds 10% of free system memory.

Tags: python-3.x, keras, tensorflow2.0, tensorflow-datasets

Solution

Ultimately, you won't see 100% memory usage, even with an NVMe drive, because caching slows things down before you get there. Beyond that, TensorFlow emits that warning preemptively rather than only when an allocation actually fails. GPU compute in TensorFlow also tends to be very RAM-hungry. My best suggestion is to keep halving the batch size until it runs.
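A minimal sketch of that suggestion, reusing the images array and train() from the question. The tf.errors.ResourceExhaustedError catch is an assumption about how the failure surfaces on your machine; a hard abort in the host allocator would need a different guard. drop_remainder=True keeps the image batch the same size as the noise batch that train_step builds from BATCH_SIZE:

import tensorflow as tf

BATCH_SIZE = 64
while BATCH_SIZE >= 1:
    try:
        train_dataset = (tf.data.Dataset.from_tensor_slices(images)
                         .shuffle(len(images))
                         .batch(BATCH_SIZE, drop_remainder=True))
        train(train_dataset, EPOCHS, epoch_save, seed)
        break
    except tf.errors.ResourceExhaustedError:
        # halve the batch size and try again
        BATCH_SIZE //= 2
        print(f"Out of memory; retrying with batch size {BATCH_SIZE}")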

Another point, though I'm not sure whether it's related: a large number of problems with RTX 3xxx cards have been reported on NVIDIA's Isaac application forums.

