tensorflow - bilstm 和 attention 寻找文本的主题表示
问题描述
# Preprocessing of data: load the CSV and discard the stray index column.
df = pd.read_csv("small_quac.csv").drop(columns=['Unnamed: 0'])
shared_topic, section_title, for_tokenize = read_data(df)

# Define x_train (source texts) and y_train (target titles).
x_train = np.asarray(shared_topic)
y_train = np.asarray(section_title)

# Find the longest sequence on each side, and the overall maximum.
max_seq_len_x = get_max_seq_len(x_train, remove_stopwords=False)
max_seq_len_y = get_max_seq_len(y_train, remove_stopwords=False)
max_seq_len = max(max_seq_len_x, max_seq_len_y)

# Fit one tokenizer on the combined corpus; only newlines are filtered out.
tokenizer = Tokenizer(filters='\n')
tokenizer.fit_on_texts(for_tokenize)
# +1 because index 0 is used as the padding value below.
vocab_size = len(tokenizer.word_index) + 1

# Vocabulary lookups and word frequencies exposed by the fitted tokenizer.
word2idx, idx2word, fdist = (
    tokenizer.word_index,
    tokenizer.index_word,
    tokenizer.word_counts,
)

# Convert texts to id sequences and right-pad each side to its own max length.
X = pad_sequences(
    tokenizer.texts_to_sequences(x_train),
    maxlen=max_seq_len_x,
    padding='post',
)
y = pad_sequences(
    tokenizer.texts_to_sequences(y_train),
    maxlen=max_seq_len_y,
    padding='post',
)
# from here modelling starts
rnn_cell_size = 128
# Take the sequence lengths from the padded arrays instead of hard-coding
# 14 / 139, so the model's input/output sizes always agree with the data
# that is actually fed to fit().
max_seq_len_y = y.shape[1]
max_seq_len_x = X.shape[1]
class Attention(tf.keras.Model):
    """Additive attention that pools a sequence of features into one vector.

    The score for each step is ``V(tanh(W1(features) + W2(hidden)))``; the
    scores are softmax-normalised along axis 1 and used to take a weighted
    sum of ``features`` along that same axis.
    """

    def __init__(self, units):
        """Create the three dense projections.

        Args:
            units: Output width of the ``W1`` and ``W2`` projections.
        """
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Return ``(context_vector, attention_weights)``.

        Args:
            features: Sequence to attend over.
                Assumes shape (batch, time, dim) -- TODO confirm.
            hidden: Query state. Assumes shape (batch, dim) -- TODO confirm.
        """
        # Insert an axis at position 1 so the query broadcasts against
        # every step of ``features``.
        query = tf.expand_dims(hidden, 1)
        energies = tf.nn.tanh(self.W1(features) + self.W2(query))
        # Normalise the per-step scores along axis 1.
        attention_weights = tf.nn.softmax(self.V(energies), axis=1)
        # Weighted sum of the features along axis 1.
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights
# Encoder input: one padded sequence of token ids per sample.
sequence_input = tf.keras.layers.Input(shape=(max_seq_len_x,))

# Frozen embedding layer initialised from the pre-computed embedding matrix;
# mask_zero=True marks id 0 as padding.
embedded_sequences = tf.keras.layers.Embedding(
    vocab_size,
    300,
    weights=[embedding_matrix],
    trainable=False,
    mask_zero=True,
    name='Encoder-Word-Embedding',
)(sequence_input)

# First BiLSTM. Because return_state=True, `lstm` is the layer's full output
# (sequence output plus the forward/backward states), not a single tensor.
# NOTE(review): that whole output is passed to the next Bidirectional layer,
# which presumably uses the trailing state tensors as its initial state --
# confirm this is intended, otherwise set return_state=False here.
lstm = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(
        rnn_cell_size,
        dropout=0.3,
        return_sequences=True,
        return_state=True,
        recurrent_activation='relu',
        recurrent_initializer='glorot_uniform',
    ),
    name="bi_lstm_0",
)(embedded_sequences)

# Second BiLSTM: keep the per-step outputs for attention and the final
# hidden/cell states of both directions.
lstm, forward_h, forward_c, backward_h, backward_c = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(
        rnn_cell_size,
        dropout=0.2,
        return_sequences=True,
        return_state=True,
        recurrent_activation='relu',
        recurrent_initializer='glorot_uniform',
    )
)(lstm)

# Join the forward and backward final states.
state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])

# Attend over the per-step outputs using the joined hidden state as the query.
context_vector, attention_weights = Attention(32)(lstm, state_h)

# NOTE(review): this emits one softmax distribution over max_seq_len_y units,
# while `y` appears to hold padded token ids; 'categorical_crossentropy'
# expects one-hot / probability targets. Confirm the target encoding -- to
# predict title tokens, the output would need a vocab_size-wide distribution
# per output step (e.g. with a sparse categorical loss).
output = tf.keras.layers.Dense(max_seq_len_y, activation='softmax')(context_vector)
model = tf.keras.Model(inputs=sequence_input, outputs=output)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# summarize layers (summary() prints by itself and returns None, so it is
# called once and not wrapped in print()).
model.summary()

history = model.fit(x=X, y=y, epochs=30)
另外,我使用的是 300 维的 GloVe 词嵌入。
这里我的 X 是一个形状为 (59, 139) 的矩阵,其中 59 = 样本数,139 = 我的文本行中最长句子的长度。这 139 个值是各个单词在词汇表 word2idx 中对应的索引。
Y 是一个形状为 (59, 14) 的矩阵,其中 59 同上,14 = 最长标题的长度,其中的值同样是各个单词在词汇表 word2idx 中对应的索引。
例如我想要这个:
输入:
array([293, 40, 294, 129, 75, 130, 129, 131, 295, 296, 132, 297, 298,
2, 299, 34, 12, 76, 300, 27, 301, 15, 1, 302, 133, 4,
77, 303, 3, 134, 304, 78, 34, 305, 11, 306, 307, 4, 1,
132, 135, 22, 10, 308, 11, 136, 4, 1, 309, 50, 4, 310,
11, 78, 311, 312, 3, 77, 1, 313, 130, 10, 137, 11, 12,
109, 7, 314, 315, 7, 1, 76, 316, 4, 317, 318, 34, 138,
319, 139, 320, 3, 77, 321, 79, 322, 4, 1, 323, 324, 4,
1, 325, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0])
输出:
array([1040, 1041, 2, 1042, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
请帮帮我,我已经花了很多天寻找解决方法,但一直没有找到。
解决方案
推荐阅读
- sql - 如何在本地更改水晶报表上的 sql 查询而不从我的机器访问客户的数据库?
- node.js - moment.js 在不同的时区给出错误的时间
- c# - C# 在批量大数据集上并行运行函数
- python - 程序终止时最终 Tkinter 窗口不会关闭
- swift - 私有集合类及其方法 Swift 4
- mysql - 使用 MYSQL Workbench 查询将数据 (DATETIME) 加载到 MySQL 时出现错误 1265
- google-apps-script - 谷歌脚本。自动创建触发器
- javascript - 调度程序在接收反应本机的参数之前发送请求
- c# - 如何在c#中正确使用soap webservice客户端
- c++ - 此类设置是否违反 Liskov 替换原则