BiLSTM and attention to find the topic representation of text

Problem Description

# Preprocessing of the data
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

df = pd.read_csv("small_quac.csv")
df = df.drop(['Unnamed: 0'], axis=1)
shared_topic, section_title, for_tokenize = read_data(df)

# Define x_train and y_train
x_train = np.asarray(shared_topic)
y_train = np.asarray(section_title)

# Find the maximum sequence lengths of the texts and the titles
max_seq_len_x = get_max_seq_len(x_train, remove_stopwords=False)
max_seq_len_y = get_max_seq_len(y_train, remove_stopwords=False)
max_seq_len = max(max_seq_len_x, max_seq_len_y)

# Fit the tokenizer on the full corpus so texts and titles share one vocabulary
tokenizer = Tokenizer(filters='\n')
tokenizer.fit_on_texts(for_tokenize)
vocab_size = len(tokenizer.word_index) + 1

# Convert the texts to sequences of word indices
X = tokenizer.texts_to_sequences(x_train)
y = tokenizer.texts_to_sequences(y_train)

word2idx = tokenizer.word_index
idx2word = tokenizer.index_word
fdist = tokenizer.word_counts

# Zero-pad every sequence to the maximum length
X = pad_sequences(X, maxlen=max_seq_len_x, padding='post')
y = pad_sequences(y, maxlen=max_seq_len_y, padding='post')
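
The helpers read_data and get_max_seq_len are not shown in the question. A minimal sketch of what they presumably do, assuming the CSV has shared_topic and section_title text columns (both the column names and the function bodies below are guesses, not the asker's actual code):

def read_data(df):
    # Assumed: extract the context texts and their titles, plus a combined
    # list used to fit the tokenizer on the full vocabulary
    shared_topic = df['shared_topic'].tolist()
    section_title = df['section_title'].tolist()
    for_tokenize = shared_topic + section_title
    return shared_topic, section_title, for_tokenize

def get_max_seq_len(texts, remove_stopwords=False):
    # Assumed: length in words of the longest text in the list;
    # remove_stopwords is kept for signature compatibility but unused here
    return max(len(t.split()) for t in texts)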


# The modelling starts here

rnn_cell_size = 128
max_seq_len_x = 139  # length of the longest input text
max_seq_len_y = 14   # length of the longest title

class Attention(tf.keras.Model):
    """Bahdanau-style additive attention."""
    def __init__(self, units):
        super(Attention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        # features: (batch, time, hidden_size); hidden: (batch, hidden_size)
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        # score: (batch, time, 1)
        score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))
        # Normalise the scores over the time axis
        attention_weights = tf.nn.softmax(self.V(score), axis=1)
        # Weighted sum of the encoder features: (batch, hidden_size)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

sequence_input = tf.keras.layers.Input(shape=(max_seq_len_x,))

# Frozen 300-dimensional GloVe embeddings (embedding_matrix is built from
# GloVe vectors, as described below)
embedded_sequences = tf.keras.layers.Embedding(vocab_size, 300,
                                               weights=[embedding_matrix],
                                               trainable=False, mask_zero=True,
                                               name='Encoder-Word-Embedding')(sequence_input)

# First BiLSTM: return_state is dropped here, since its five outputs were
# assigned to a single variable and would break the layer that follows
lstm = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(rnn_cell_size,
                         dropout=0.3,
                         return_sequences=True,
                         recurrent_activation='relu',
                         recurrent_initializer='glorot_uniform'),
    name="bi_lstm_0")(embedded_sequences)

# Second BiLSTM: keep the per-timestep outputs and the final states
lstm, forward_h, forward_c, backward_h, backward_c = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(rnn_cell_size,
                         dropout=0.2,
                         return_sequences=True,
                         return_state=True,
                         recurrent_activation='relu',
                         recurrent_initializer='glorot_uniform'))(lstm)

# Concatenate the final forward and backward states
state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])

# Attend over the encoder outputs, conditioned on the final hidden state
context_vector, attention_weights = Attention(32)(lstm, state_h)

# A single softmax over max_seq_len_y positions; note that y contains integer
# word indices, so this output does not match the title sequences in y
output = tf.keras.layers.Dense(max_seq_len_y, activation='softmax')(context_vector)

model = tf.keras.Model(inputs=sequence_input, outputs=output)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Summarize the layers
model.summary()

history = model.fit(x=X, y=y, epochs=30)

Also, I am using 300-dimensional GloVe embeddings.
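
The embedding_matrix used in the model is not shown in the question. A minimal sketch of how such a matrix is typically built from GloVe vectors (the file name glove.6B.300d.txt is an assumption):

import numpy as np

# Assumed file: standard GloVe text format, one "word v1 v2 ... v300" per line
embeddings_index = {}
with open('glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# Row i holds the GloVe vector for the word with index i; words missing
# from GloVe keep a zero vector
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in word2idx.items():
    vector = embeddings_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector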

Here my X is a matrix of shape (59, 139), where 59 = the number of samples and 139 = the length of the longest sentence in my texts. The 139 values are word indices from my vocabulary's word2idx.

Y is a matrix of shape (59, 14), where 59 = the same as above and 14 = the length of my longest title, also filled with word indices from word2idx.
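
A quick way to confirm these shapes against the model (a minimal check, assuming the code above ran as shown):

print(X.shape)             # (59, 139): 59 samples, padded to the longest text
print(y.shape)             # (59, 14): 59 titles, padded to the longest title
print(model.output_shape)  # (None, 14): one softmax over 14 positions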

For example, I want this:

Input:

array([293,  40, 294, 129,  75, 130, 129, 131, 295, 296, 132, 297, 298,
         2, 299,  34,  12,  76, 300,  27, 301,  15,   1, 302, 133,   4,
        77, 303,   3, 134, 304,  78,  34, 305,  11, 306, 307,   4,   1,
       132, 135,  22,  10, 308,  11, 136,   4,   1, 309,  50,   4, 310,
        11,  78, 311, 312,   3,  77,   1, 313, 130,  10, 137,  11,  12,
       109,   7, 314, 315,   7,   1,  76, 316,   4, 317, 318,  34, 138,
       319, 139, 320,   3,  77, 321,  79, 322,   4,   1, 323, 324,   4,
         1, 325,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0])

Output:

array([1040, 1041,    2, 1042,    0,    0,    0,    0,    0,    0,    0,  0,    0,    0])
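
To read such a pair as text again, the padded index sequences can be mapped back through idx2word (a minimal sketch; index 0 is the padding value and has no dictionary entry):

def decode(seq, idx2word):
    # Map indices back to words, skipping the zero padding
    return ' '.join(idx2word[i] for i in seq if i != 0)

print(decode(X[0], idx2word))  # the input text
print(decode(y[0], idx2word))  # the corresponding title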

Please help me. I have spent so many days looking for an approach, but I cannot find one.

Tags: tensorflow, machine-learning, keras, deep-learning, nlp

Solution

