python - Transformer 模型(仅编码器)无法学习
问题描述
我正在尝试用 Transformer 模型做序列分类,但遇到了困难。作为“嵌入”,我使用的是 CNN 预处理器的输出。
代码如下:
def generate_transformer_model(d_model=512,
                               seq_len=5,
                               depth_feed_forward=2048,
                               cnn_kernel_size=10,
                               cnn_pool_size=10,
                               cnn_padding='same',
                               num_trans_layers=10,
                               num_attention_heads=10,
                               dropout_rate=0.1):
    """Build an encoder-only Transformer classifier fed by a CNN front end.

    The model takes inputs of shape ``(5000, 12)`` per sample, runs them
    through three Conv1D + MaxPool1D stages to obtain a short sequence of
    ``d_model``-wide vectors, adds a sinusoidal positional encoding, applies
    a stack of encoder layers, and finishes with global average pooling and
    a 4-unit sigmoid ``Dense`` layer.

    Args:
        d_model: Width of the encoder and filter count of the last Conv1D
            stage (the first two stages use ``d_model // 4`` and
            ``d_model // 2`` filters).
        seq_len: Length of the positional encoding. It must equal the
            sequence length produced by the CNN front end (5 for the
            default kernel/pool settings with ``'same'`` padding).
        depth_feed_forward: Hidden width of each encoder feed-forward block.
        cnn_kernel_size: Kernel size of every Conv1D layer.
        cnn_pool_size: Pool size of every MaxPool1D layer.
        cnn_padding: Padding mode for both the Conv1D and MaxPool1D layers.
        num_trans_layers: Number of stacked encoder layers.
        num_attention_heads: Number of attention heads per encoder layer.
        dropout_rate: Dropout rate used inside every encoder layer.

    Returns:
        An uncompiled ``tf.keras.Model`` named ``'Transformer'``.
    """

    def positional_encoding(length, depth):
        """Return the sinusoidal encoding as a (1, length, depth) float32 tensor.

        Follows the formulation of Vaswani et al., 2017.
        """
        positions = np.arange(length)[:, np.newaxis]
        dims = np.arange(depth)[np.newaxis, :]
        angle_rates = 1 / np.power(10000, (2 * (dims // 2)) / np.float32(depth))
        angle_rads = positions * angle_rates
        # sin on even indices (2i), cos on odd indices (2i + 1)
        angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
        angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
        return tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)

    def point_wise_feed_forward_network(width, dff):
        """Return the two-layer position-wise feed-forward block."""
        return tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),  # (batch, seq_len, dff)
            tf.keras.layers.Dense(width),                   # (batch, seq_len, width)
        ])

    class EncoderLayer(tf.keras.layers.Layer):
        """One post-norm encoder layer: self-attention followed by a FFN."""

        def __init__(self, d_model, num_heads, dff, rate=0.1):
            super().__init__()
            # MultiHeadAttention's signature is (num_heads, key_dim, ...).
            # The original call passed (d_model, num_heads) positionally,
            # which created d_model heads of size num_heads. Use keywords
            # and the conventional per-head size d_model // num_heads.
            self.mha = tf.keras.layers.MultiHeadAttention(
                num_heads=num_heads,
                key_dim=max(1, d_model // num_heads))
            self.ffn = point_wise_feed_forward_network(d_model, dff)
            self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
            self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
            self.dropout1 = tf.keras.layers.Dropout(rate)
            self.dropout2 = tf.keras.layers.Dropout(rate)

        def call(self, x, training=None):
            attn_output = self.mha(x, x, x)  # (batch, seq_len, d_model)
            attn_output = self.dropout1(attn_output, training=training)
            out1 = self.layernorm1(x + attn_output)
            ffn_output = self.ffn(out1)  # (batch, seq_len, d_model)
            ffn_output = self.dropout2(ffn_output, training=training)
            return self.layernorm2(out1 + ffn_output)

    # --- model assembly -------------------------------------------------
    # `inputs` rather than `input`, to avoid shadowing the builtin.
    inputs = tf.keras.layers.Input(shape=(5000, 12))

    # Conv1D expects an integer filter count, hence floor division.
    # TODO: add dropout to the CNN layers.
    cnn_preprocessor = tf.keras.Sequential(
        [
            tf.keras.layers.Conv1D(filters=d_model // 4,
                                   kernel_size=cnn_kernel_size,
                                   padding=cnn_padding,
                                   activation='relu'),
            tf.keras.layers.MaxPool1D(cnn_pool_size,
                                      padding=cnn_padding),  # length / 10
            tf.keras.layers.Conv1D(filters=d_model // 2,
                                   kernel_size=cnn_kernel_size,
                                   padding=cnn_padding,
                                   activation='relu'),
            tf.keras.layers.MaxPool1D(cnn_pool_size,
                                      padding=cnn_padding),  # length / 100
            tf.keras.layers.Conv1D(filters=d_model,
                                   kernel_size=cnn_kernel_size,
                                   padding=cnn_padding,
                                   activation='relu'),
            tf.keras.layers.MaxPool1D(cnn_pool_size,
                                      padding=cnn_padding),  # length / 1000
        ],
        name='cnn_preprocessor')
    cnn_preprocessor.build(input_shape=(None, 5000, 12))
    cnn_preprocessor.summary()

    embedding = cnn_preprocessor(inputs)
    # Scale the embedding by sqrt(d_model) before adding positions.
    normed_embedding = embedding * tf.math.sqrt(tf.cast(d_model, tf.float32))
    pos_encoding = positional_encoding(seq_len, d_model)
    transformer_input = normed_embedding + pos_encoding

    transformer = tf.keras.Sequential(name="Transformer")
    for _ in range(num_trans_layers):
        # Forward dropout_rate; it was previously accepted but never used.
        transformer.add(EncoderLayer(d_model=d_model,
                                     num_heads=num_attention_heads,
                                     dff=depth_feed_forward,
                                     rate=dropout_rate))
    transformer_output = transformer(transformer_input)

    # NOTE(review): sigmoid gives four independent probabilities. If the
    # four classes are mutually exclusive, softmax with a categorical
    # cross-entropy loss is presumably what is wanted; confirm with the
    # training setup.
    classifier = tf.keras.Sequential([
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(4, activation='sigmoid'),
    ])
    output = classifier(transformer_output)

    model = tf.keras.Model(inputs=inputs, outputs=output, name='Transformer')
    return model
谁能提示一下这个设置有什么问题?我检查了各层的张量维度,看起来都合理。模型可以编译,但准确率并不比随机猜测好。
是否可以像这样把 CNN 卷积核学到的输出当作嵌入来使用(其中 n_filters=d_model)?
解决方案
推荐阅读
- mysql - FOR 功能导致兼容性问题 寻找 FOR 功能的替代品
- javascript - 在 MS Word 插件 Javascript API 中读取表格边框的颜色和宽度
- c - Brainfuck解释器没有运行一些代码
- tensorflow - TensorFlow Dataset 的函数 cache() 和 prefetch() 有什么作用?
- python - 从文本pyspark中提取字符串
- ios - 仅当两个文本字段都填写时,如何在 Swift 中添加两个数字
- node.js - 是否有可能从机器人获得公会规模,而不是我的?
- c++ - 在不同文件之间传输变量
- sql-server - 创建现有数据库的副本并使用 Powershell 脚本对其进行重命名
- javascript - 如何匹配列表中的值,但在 Javascript 中返回它后面的值?