tensorflow - Why does my network use so little GPU memory?
Problem description
As the title says, I have a network to train on 670,000 samples, about 15 GB of data. But when I run it on two V-100s, the GPU memory usage is always 305 MB while it runs.
I tried changing the batch size, and even switched to another network, but the GPU memory usage was still 305 MB. What is going on? Most importantly, when I copied this code to another Linux machine with a P-100, it ran about 15 minutes per epoch, even though the V-100 machine's resources (CPU, memory) are all better than the P-100 machine's. These are my software versions:
TensorFlow: 1.14
Keras: 2.2.4
NumPy: 1.17.4
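
A standard TF 1.x check, runnable as-is on both machines, shows which devices this build can use and how much memory TensorFlow reserves on each; if no /device:GPU entries appear, training is happening on the CPU regardless of the installed hardware:

import tensorflow as tf
from tensorflow.python.client import device_lib

# Each entry carries a name (e.g. "/device:GPU:0") and a memory_limit in bytes.
for d in device_lib.list_local_devices():
    print(d.name, d.device_type, d.memory_limit)

print(tf.test.is_gpu_available())  # True only for a CUDA-enabled build that finds a GPU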
Here is my model code:
# Imports used below (Keras 2.2.4 API):
from keras import backend as K
from keras import initializers, regularizers, constraints
from keras.layers import Layer


class AttentionLayer(Layer):
    def __init__(self, step_dim, W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(AttentionLayer, self).__init__(**kwargs)

    def compute_mask(self, inputs, mask=None):
        # This layer consumes the mask; do not pass it on.
        return None

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None
        self.built = True

    # input: (None, sentence_length, embedding_size)
    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim
        # One attention score per time step: e = tanh(x . W + b)
        eij = K.reshape(
            K.dot(K.reshape(x, (-1, features_dim)),
                  K.reshape(self.W, (features_dim, 1))),
            (-1, step_dim))
        if self.bias:
            eij += self.b
        eij = K.tanh(eij)
        a = K.exp(eij)
        # Apply the mask after the exp; the weights are re-normalized below.
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in Theano.
            a *= K.cast(mask, K.floatx())
        # In some cases, especially early in training, the sum may be almost zero.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        # Weighted sum over the time steps: output is (None, features_dim).
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], self.features_dim

    def get_config(self):
        config = {'step_dim': self.step_dim}
        base_config = super(AttentionLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
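
For reference, a minimal shape check for this layer; the toy dimensions (10 steps, 16 features) are invented, not taken from the original model:

import numpy as np
from keras.layers import Input
from keras.models import Model

inp = Input(shape=(10, 16))                   # (step_dim, features), hypothetical sizes
pooled = AttentionLayer(step_dim=10)(inp)     # attention-weighted sum over the 10 steps
m = Model(inp, pooled)
print(m.output_shape)                         # (None, 16): one pooled vector per sample
print(m.predict(np.random.randn(2, 10, 16)).shape)  # (2, 16)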
from keras.layers import (Input, Embedding, Bidirectional, LSTM,
                          Dense, Dropout, Concatenate)
from keras.models import Model
from keras import optimizers
from keras.losses import categorical_crossentropy


def create_lstm_model(id_col_list, embedding_matrix_list, class_num):
    input_list = []
    concate_layer_list = []
    for i in range(len(id_col_list)):
        id_col = id_col_list[i]
        input = Input(shape=(tx_ad_2020_util.id_seq_max_len[id_col],), dtype='int32')
        input_list.append(input)
        embedding_matrix = embedding_matrix_list[i]
        embedding_layer = Embedding(
            input_dim=len(embedding_matrix),
            output_dim=tx_ad_2020_util.id_embedding_size[id_col],
            weights=[embedding_matrix],
            trainable=True,
            input_length=tx_ad_2020_util.id_seq_max_len[id_col]
        )
        attention_layer = AttentionLayer(tx_ad_2020_util.id_seq_max_len[id_col])
        bi_lstm_layer = Bidirectional(LSTM(128))
        s1 = embedding_layer(input)
        s1_bi = bi_lstm_layer(s1)
        s1_att = attention_layer(s1)
        # Note: s1_last is built but never used below; only s1_att feeds the dense head.
        s1_last = Concatenate(axis=1)([s1_att, s1_bi])
        s1_att_den = Dense(64)(s1_att)
        concate_layer_list.append(s1_att_den)
    last_list_layer = Concatenate(axis=1)(concate_layer_list)
    last_list_layer = Dropout(rate=0.4)(last_list_layer)
    # Dense head
    dense_layer1 = Dense(64, activation='softmax')(last_list_layer)
    dense_layer2 = Dense(64, activation='sigmoid')(last_list_layer)
    output_layer = Concatenate(axis=1)([dense_layer1, dense_layer2])
    output_layer = Dense(class_num, activation='softmax')(output_layer)
    model = Model(
        inputs=input_list,
        outputs=[output_layer], name="lstm_dssm"
    )
    if class_num == 2:
        model.compile(
            loss="binary_crossentropy",
            optimizer=optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=None, decay=0.0),
            metrics=["binary_accuracy"]
        )
    else:
        model.compile(
            loss=categorical_crossentropy,
            optimizer=optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=None, decay=0.0),
            metrics=["categorical_accuracy"]
        )
    return model
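
A hypothetical driver for this function; the column names and vocabulary sizes below are invented, and the call assumes tx_ad_2020_util.id_seq_max_len and tx_ad_2020_util.id_embedding_size (the asker's own utility module) contain matching entries for these columns:

import numpy as np

id_col_list = ['creative_id', 'ad_id']               # invented example columns
embedding_matrix_list = [np.random.randn(1000, 64),  # 1000-word vocab, size 64
                         np.random.randn(500, 64)]   # 500-word vocab, size 64
model = create_lstm_model(id_col_list, embedding_matrix_list, class_num=10)
model.summary()
# model.fit([seq_a, seq_b], labels, batch_size=1024, epochs=5)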
There are 15 such inputs. The data are time-series-like ID sequences: for each column I first train word2vec on the sequences to get an embedding matrix, then pass that matrix to the Keras Embedding layer as the initial value of its weights.
Here is my word2vec code:
import os
import gc
import numpy as np
from gensim.models import word2vec, KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence


def train_id_w2v_model(id_col, embedding_size):
    '''Train a skip-gram word2vec model for one ID column and save the vectors.

    :param id_col: name of the ID column whose sequences are used as sentences
    :param embedding_size: dimensionality of the learned vectors
    '''
    basic_path = tx_ad_2020_util.project_data_path
    if tx_ad_2020_util.is_test:
        basic_path = tx_ad_2020_util.project_data_test_path
    texts = tx_ad_2020_util.load(os.path.join(basic_path, id_col + "_texts_all.pickle"))
    texts = [line.split(" ") for line in texts]
    model = word2vec.Word2Vec(sentences=texts, size=embedding_size, window=10, workers=2, sg=1)
    model.wv.save_word2vec_format(os.path.join(basic_path, id_col + "_word_w2v.bigram"), binary=True)


def get_id_embedding_matrix(id_col, max_vocab_size, embedding_size, seq_max_len):
    basic_path = tx_ad_2020_util.project_data_path
    if tx_ad_2020_util.is_test:
        basic_path = tx_ad_2020_util.project_data_test_path
    all_texts = tx_ad_2020_util.load(os.path.join(basic_path, id_col + "_texts_all.pickle"))
    tokenizer = Tokenizer(
        num_words=max_vocab_size,
        split=' ',
        lower=False,
        char_level=False,
        filters=''
    )
    tokenizer.fit_on_texts(all_texts)
    tx_ad_2020_util.save(tokenizer, os.path.join(basic_path, id_col + '_word_tokenizer.pickle'))
    train_texts = tx_ad_2020_util.load(os.path.join(basic_path, id_col + "_texts_train.pickle"))
    train_texts_seq = tokenizer.texts_to_sequences(train_texts)
    train_texts_seq = sequence.pad_sequences(train_texts_seq, maxlen=seq_max_len)
    dev_texts = tx_ad_2020_util.load(os.path.join(basic_path, id_col + "_texts_dev.pickle"))
    dev_texts_seq = tokenizer.texts_to_sequences(dev_texts)
    dev_texts_seq = sequence.pad_sequences(dev_texts_seq, maxlen=seq_max_len)
    test_texts = tx_ad_2020_util.load(os.path.join(basic_path, id_col + "_texts_test.pickle"))
    test_texts_seq = tokenizer.texts_to_sequences(test_texts)
    test_texts_seq = sequence.pad_sequences(test_texts_seq, maxlen=seq_max_len)
    tx_ad_2020_util.save(train_texts_seq, os.path.join(basic_path, id_col + "_pad_seq_train.pickle"))
    tx_ad_2020_util.save(dev_texts_seq, os.path.join(basic_path, id_col + "_pad_seq_dev.pickle"))
    tx_ad_2020_util.save(test_texts_seq, os.path.join(basic_path, id_col + "_pad_seq_test.pickle"))
    word_index_dict = tokenizer.word_index
    w2v_model = KeyedVectors.load_word2vec_format(os.path.join(basic_path, id_col + "_word_w2v.bigram"), binary=True)
    # Start from random vectors; rows for in-vocabulary words are overwritten below.
    # Index 0 is reserved by the Tokenizer (used for padding) and stays random.
    embedding_matrix = np.random.randn(len(word_index_dict) + 1, embedding_size)
    for word, index in word_index_dict.items():
        if word in w2v_model.vocab:
            embedding_matrix[index] = w2v_model.word_vec(word)
    tx_ad_2020_util.save(embedding_matrix, os.path.join(basic_path, id_col + '_embedding_matrix.pickle'))
    del w2v_model
    del word_index_dict
    del tokenizer
    gc.collect()
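
A hypothetical end-to-end call for one column, assuming the creative_id_texts_*.pickle files already exist under the project data path (the column name and sizes are invented):

train_id_w2v_model('creative_id', embedding_size=64)
get_id_embedding_matrix('creative_id', max_vocab_size=1000000,
                        embedding_size=64, seq_max_len=100)
# The saved matrix is what create_lstm_model later receives:
embedding_matrix = tx_ad_2020_util.load(
    os.path.join(tx_ad_2020_util.project_data_path, 'creative_id_embedding_matrix.pickle'))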
Solution
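Given that the 305 MB footprint never changes with batch size, one plausible first suspect (a guess, not a confirmed fix) is that the graph is not running on the GPU at all: TF 1.x normally pre-allocates most of a GPU's memory the moment a session starts, so a tiny constant footprint is typical of the CPU-only tensorflow wheel being installed instead of tensorflow-gpu, or of a CUDA/driver mismatch. That would also explain why the nominally better V-100 machine trains slower than the P-100 one. A minimal sketch for pinning this down, assuming TF 1.14 with the Keras TF backend:

import tensorflow as tf
from keras import backend as K

# Log the device every op is placed on; the LSTM/MatMul ops should
# report /device:GPU:0 (or GPU:1) rather than /device:CPU:0.
config = tf.ConfigProto(log_device_placement=True)
# Optional: allocate GPU memory on demand instead of pre-allocating it,
# so nvidia-smi reflects what the model actually uses.
config.gpu_options.allow_growth = True
K.set_session(tf.Session(config=config))

# Build and fit the model as above; the console now shows op placement.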