Transformer model (encoder-only) does not learn

Problem description

I am struggling with a transformer model that I am trying to use for sequence classification. As the "embedding", I use the output of a CNN preprocessor.

The code is as follows:

import numpy as np
import tensorflow as tf


def generate_transformer_model(d_model=512,
                               seq_len=5,
                               depth_feed_forward=2048,
                               cnn_kernel_size=10,
                               cnn_pool_size=10,
                               cnn_padding='same',
                               num_trans_layers=10,
                               num_attention_heads=10,
                               dropout_rate=0.1):


  # define auxiliary function that generates the sinusoidal positional encoding according to Vaswani et al., 2017
  def positional_encoding(seq_len, d_model):

    def get_angles(pos, i, d_model):
      angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
      return pos * angle_rates

    angle_rads = get_angles(np.arange(seq_len)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)

    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)
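
  # sanity check (hypothetical): positional_encoding(5, 512) has shape
  # (1, 5, 512); even feature indices carry sin(pos / 10000^(2i/d_model)),
  # odd indices the matching cos term, as in the original paper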

  def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
        tf.keras.layers.Dense(d_model)                  # (batch_size, seq_len, d_model)
    ])


  # model implementation starts here
  inputs = tf.keras.layers.Input(shape=(5000, 12))  # 12-channel sequences of length 5000


  # TODO: add dropout to the CNN layers
  # three conv/pool stages, each pooling by cnn_pool_size, so the sequence
  # length shrinks from 5000 to 5000 / cnn_pool_size**3 = 5
  cnn_preprocessor = tf.keras.Sequential(
      [tf.keras.layers.Conv1D(filters=d_model // 4,  # filters must be an int
                              kernel_size=cnn_kernel_size,
                              padding=cnn_padding,
                              activation='relu'),
       tf.keras.layers.MaxPool1D(cnn_pool_size,
                                 padding=cnn_padding),  # factor 10
       tf.keras.layers.Conv1D(filters=d_model // 2,
                              kernel_size=cnn_kernel_size,
                              padding=cnn_padding,
                              activation='relu'),
       tf.keras.layers.MaxPool1D(cnn_pool_size,
                                 padding=cnn_padding),  # factor 100
       tf.keras.layers.Conv1D(filters=d_model,
                              kernel_size=cnn_kernel_size,
                              padding=cnn_padding,
                              activation='relu'),
       tf.keras.layers.MaxPool1D(cnn_pool_size,
                                 padding=cnn_padding)],  # factor 1000
      name='cnn_preprocessor')

  cnn_preprocessor.build(input_shape=(None,5000,12))
  cnn_preprocessor.summary()
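  # with the defaults this should report an output shape of (None, 5, 512):
  # 5000 time steps pooled down by 10**3, with d_model channels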

  embedding = cnn_preprocessor(inputs)
  # scale by sqrt(d_model) as in Vaswani et al., then add the fixed sinusoidal
  # positional encoding (broadcast across the batch dimension)
  scaled_embedding = embedding * tf.math.sqrt(tf.cast(d_model, tf.float32))
  pos_encoding = positional_encoding(seq_len, d_model)

  transformer_input = scaled_embedding + pos_encoding  # seq_len must equal the CNN output length (5)

  

  class EncoderLayer(tf.keras.layers.Layer):
  
    def __init__(self, d_model, num_heads, dff, rate=0.1):
      super().__init__()

      # Keras MultiHeadAttention expects num_heads first and then key_dim (the
      # per-head projection size); passing (d_model, num_heads) positionally
      # would instead create d_model heads with key_dim=num_heads
      self.mha = tf.keras.layers.MultiHeadAttention(num_heads=num_heads,
                                                    key_dim=d_model // num_heads)
      self.ffn = point_wise_feed_forward_network(d_model, dff)

      self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
      self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

      self.dropout1 = tf.keras.layers.Dropout(rate)
      self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training=None):

      attn_output = self.mha(x, x, x)  # (batch_size, input_seq_len, d_model)
      attn_output = self.dropout1(attn_output, training=training)
      out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

      ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
      ffn_output = self.dropout2(ffn_output, training=training)
      out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)

      return out2
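
  # note: each EncoderLayer maps (batch_size, seq_len, d_model) to the same
  # shape (post-norm residual blocks), so the layers stack in a plain Sequential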

  transformer = tf.keras.Sequential(name="Transformer")
  for i in range(num_trans_layers):
    transformer.add(EncoderLayer(d_model=d_model,
                                 num_heads=num_attention_heads,
                                 dff=depth_feed_forward))
  transformer_output = transformer(transformer_input)

  classifier = tf.keras.Sequential([tf.keras.layers.GlobalAveragePooling1D(),
                                    tf.keras.layers.Dense(4, activation='sigmoid')])

  output = classifier(transformer_output)

  model = tf.keras.Model(inputs=inputs, outputs=output, name='Transformer')

  return model
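
For completeness, here is a minimal sketch of how I compile and fit the model (the optimizer, loss, and dummy data below are illustrative assumptions; binary cross-entropy matches the four sigmoid outputs only if the labels are multi-label, otherwise softmax plus categorical cross-entropy would be the usual pairing):

model = generate_transformer_model()
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
              loss='binary_crossentropy',  # assumption: independent multi-label targets
              metrics=['accuracy'])

# dummy data purely to check that shapes flow end to end (hypothetical)
x_dummy = np.random.randn(8, 5000, 12).astype('float32')
y_dummy = np.random.randint(0, 2, size=(8, 4)).astype('float32')
model.fit(x_dummy, y_dummy, epochs=1, batch_size=4)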

Can anyone give me a hint as to what is wrong with this setup? I have checked the dimensions and they look reasonable. The model compiles; it just produces results that are no more accurate than random guessing.

Is it even possible to use the "learned" embeddings from the CNN filters in this way (where n_filters=d_model)?

Tags: python, tensorflow, deep-learning, conv-neural-network, transformer

Solution

