Why does my network use so little GPU memory?

Problem Description

As the title says, I have a network to train; the training data is 670,000 samples, about 15 GB in size. But when I run it on two V100s, the GPU memory usage is always just 305 MB.

I also tried changing the batch size, and even switching to a different network, but the GPU memory usage was still 305 MB. What is going on? What's more, when I copied this code to another Linux machine with a P100, it ran at about 15 minutes per epoch, even though the V100 machine's resources (CPU, memory) are all better than the P100 machine's. Here are my software versions:

TensorFlow: 1.14

Keras: 2.2.4

NumPy: 1.17.4
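
Before digging into the model, it is probably worth confirming that TensorFlow can see the GPUs at all: TF 1.x normally reserves most of a GPU's memory up front when it actually uses the GPU, so a flat 305 MB regardless of batch size already hints that the ops may be running on the CPU. A minimal check for TF 1.14 (my sketch, not from the original post):

import tensorflow as tf
from tensorflow.python.client import device_lib

# The V100s should appear as /device:GPU:0 and /device:GPU:1 when
# CUDA/cuDNN are set up correctly for this TF build.
print(tf.test.is_gpu_available())
print([d.name for d in device_lib.list_local_devices()])

If only CPU devices are listed, both the flat 305 MB in nvidia-smi and the epoch-time difference between the two machines would be consistent with a CPU-only TensorFlow build or a CUDA driver mismatch.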

Here is my model code:


# Imports needed to run this snippet (tx_ad_2020_util is the author's own
# utility module and is not shown here):
from keras import backend as K
from keras import initializers, regularizers, constraints, optimizers
from keras.layers import (Layer, Input, Embedding, Bidirectional, LSTM,
                          Dense, Dropout, Concatenate)
from keras.losses import categorical_crossentropy
from keras.models import Model


class AttentionLayer(Layer):
    def __init__(self, step_dim, W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0

        super(AttentionLayer, self).__init__(**kwargs)

    def compute_mask(self, inputs, mask=None):
        return None


    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None
        self.built = True

    # input (None,sentence_length,embedding_size)
    def call(self, x, mask=None):

        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        # weighted_input: (batch, step_dim, features_dim); summed over time steps
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], self.features_dim

    def get_config(self):
        config = {'step_dim': self.step_dim}
        base_config = super(AttentionLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


def create_lstm_model(id_col_list, embedding_matrix_list, class_num):
    input_list = []
    concate_layer_list = []
    for i in range(len(id_col_list)):
        id_col = id_col_list[i]
        input = Input(shape=(tx_ad_2020_util.id_seq_max_len[id_col],), dtype='int32')
        input_list.append(input)
        embedding_matrix = embedding_matrix_list[i]
        embedding_layer = Embedding(
            input_dim=len(embedding_matrix),
            output_dim=tx_ad_2020_util.id_embedding_size[id_col],
            weights=[embedding_matrix],
            trainable=True,
            input_length=tx_ad_2020_util.id_seq_max_len[id_col]
        )
        attention_layer = AttentionLayer(tx_ad_2020_util.id_seq_max_len[id_col])
        bi_lstm_layer = Bidirectional(LSTM(128))
        s1 = embedding_layer(input)
        s1_bi = bi_lstm_layer(s1)
        s1_att = attention_layer(s1)

        # NOTE: s1_last is never used below, so the Bidirectional LSTM
        # branch does not reach the model's output.
        s1_last = Concatenate(axis=1)([s1_att, s1_bi])

        s1_att_den = Dense(64)(s1_att)
        concate_layer_list.append(s1_att_den)


    last_list_layer = Concatenate(axis=1)(concate_layer_list)

    last_list_layer = Dropout(rate=0.4)(last_list_layer)
    # two parallel Dense heads over the same features
    dense_layer1 = Dense(64, activation='softmax')(last_list_layer)
    dense_layer2 = Dense(64, activation='sigmoid')(last_list_layer)

    output_layer = Concatenate(axis=1)([dense_layer1, dense_layer2])

    output_layer = Dense(class_num, activation='softmax')(output_layer)

    model = Model(
        inputs=input_list,
        outputs=[output_layer],
        name="lstm_dssm"
    )
    if class_num == 2:
        model.compile(
            loss="binary_crossentropy",
            optimizer=optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=None, decay=0.0),
            metrics=["binary_accuracy"]
        )
    else:
        model.compile(
            loss=categorical_crossentropy,
            optimizer=optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=None, decay=0.0),
            metrics=["categorical_accuracy"]
        )
    return model
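
One thing worth checking in create_lstm_model: in the Keras functional API, only layers that lie on a path from the inputs to the outputs become part of the model. Since only s1_att_den feeds last_list_layer, the Bidirectional LSTM branch (s1_bi / s1_last) is dropped from the graph entirely, leaving a much smaller network than intended. A quick check (my sketch; the arguments are whatever you already pass in):

model = create_lstm_model(id_col_list, embedding_matrix_list, class_num=2)
model.summary()              # Bidirectional/LSTM layers will be missing here
print(model.count_params())  # far fewer parameters than expected

If the summary confirms this, routing s1_last (instead of s1_att alone) into the Dense head would reconnect the LSTM branch.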

There are 15 inputs. The data is time-series-like: I first train word2vec on it to get an embedding matrix for each input, and then use each matrix as the initial value of the weights in the corresponding Keras Embedding layer.
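
In miniature, the pattern described above looks like this (embedding_matrix stands for one of the pre-trained matrices):

emb = Embedding(input_dim=embedding_matrix.shape[0],
                output_dim=embedding_matrix.shape[1],
                weights=[embedding_matrix],
                trainable=True)

weights=[embedding_matrix] seeds the layer with the word2vec vectors, and trainable=True lets them be fine-tuned during training.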

Here is my word2vec code:

# Imports needed to run this snippet:
import gc
import os

import numpy as np
from gensim.models import KeyedVectors, word2vec
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer


def train_id_w2v_model(id_col, embedding_size):
    '''
    :param id_col: name of the ID column whose sequences are trained
    :param embedding_size: dimensionality of the word vectors
    :return: None; the vectors are saved to disk in word2vec binary format
    '''
    basic_path = tx_ad_2020_util.project_data_path
    if tx_ad_2020_util.is_test:
        basic_path = tx_ad_2020_util.project_data_test_path

    texts = tx_ad_2020_util.load(os.path.join(basic_path, id_col + "_texts_all.pickle"))
    texts = [line.split(" ") for line in texts]
    model = word2vec.Word2Vec(sentences=texts, size=embedding_size, window=10, workers=2, sg=1)
    model.wv.save_word2vec_format(os.path.join(basic_path, id_col + "_word_w2v.bigram"), binary=True)


def get_id_embedding_matrix(id_col, max_vocab_size, embedding_size, seq_max_len):

    basic_path = tx_ad_2020_util.project_data_path
    if tx_ad_2020_util.is_test:
        basic_path = tx_ad_2020_util.project_data_test_path

    all_texts = tx_ad_2020_util.load(os.path.join(basic_path, id_col + "_texts_all.pickle"))
    tokenizer = Tokenizer(
        num_words=max_vocab_size,
        split=' ',
        lower=False,
        char_level=False,
        filters=''
    )
    tokenizer.fit_on_texts(all_texts)
    tx_ad_2020_util.save(tokenizer, os.path.join(basic_path, id_col + '_word_tokenizer.pickle'))

    # del all_texts

    train_texts = tx_ad_2020_util.load(os.path.join(basic_path, id_col + "_texts_train.pickle"))
    train_texts_seq = tokenizer.texts_to_sequences(train_texts)
    train_texts_seq = sequence.pad_sequences(train_texts_seq, maxlen=seq_max_len)

    dev_texts = tx_ad_2020_util.load(os.path.join(basic_path, id_col + "_texts_dev.pickle"))
    dev_texts_seq = tokenizer.texts_to_sequences(dev_texts)
    dev_texts_seq = sequence.pad_sequences(dev_texts_seq, maxlen=seq_max_len)

    test_texts = tx_ad_2020_util.load(os.path.join(basic_path, id_col + "_texts_test.pickle"))
    test_texts_seq = tokenizer.texts_to_sequences(test_texts)
    test_texts_seq = sequence.pad_sequences(test_texts_seq, maxlen=seq_max_len)

    tx_ad_2020_util.save(train_texts_seq, os.path.join(basic_path, id_col + "_pad_seq_train.pickle"))
    tx_ad_2020_util.save(dev_texts_seq, os.path.join(basic_path, id_col + "_pad_seq_dev.pickle"))
    tx_ad_2020_util.save(test_texts_seq, os.path.join(basic_path, id_col + "_pad_seq_test.pickle"))

    word_index_dict = tokenizer.word_index
    w2v_model = KeyedVectors.load_word2vec_format(os.path.join(basic_path, id_col + "_word_w2v.bigram"), binary=True)
    # rows default to random values; words with a trained vector are overwritten
    embedding_matrix = np.random.randn(len(word_index_dict) + 1, embedding_size)
    embedding_matrix[0] = np.random.randn(embedding_size)  # index 0 is the padding row
    for word, index in word_index_dict.items():
        if word in w2v_model.vocab:
            embedding_matrix[index] = w2v_model.word_vec(word)
    tx_ad_2020_util.save(embedding_matrix, os.path.join(basic_path, id_col + '_embedding_matrix.pickle'))
    del w2v_model
    del word_index_dict
    del tokenizer
    gc.collect()
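
For completeness, a hypothetical call sequence for the two helpers above (the column name and the sizes are illustrative, not from the original post):

embedding_size = 128
train_id_w2v_model('creative_id', embedding_size)
get_id_embedding_matrix('creative_id', max_vocab_size=None,
                        embedding_size=embedding_size, seq_max_len=100)

With num_words=None the Tokenizer keeps the full vocabulary; the saved _embedding_matrix.pickle files are what create_lstm_model later receives as embedding_matrix_list.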

Tags: tensorflow
