python - I want to do abstractive text summarization with an LSTM
Problem description
I am working on a web-scraping project. I have 100 links and I need to scrape the first page of each one, so the length of my text is dynamic. I use nltk for the extractive summaries, and at the end I want to summarize the texts abstractively with an LSTM, using the code from a blog post. However, in the Input function I get a dimension error, because I do not know how to determine the maximum length. Please help me solve this. The LSTM part starts after line 73 of my code. I followed this article for the LSTM: https://www.analyticsvidhya.com/blog/2019/06/comprehensive-guide-text-summarization-using-deep-learning-python/. Thanks.
from googlesearch import search
from bs4 import BeautifulSoup
from matplotlib.pyplot import plot
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re
import warnings
import PyPDF2
from PyPDF2 import PdfFileReader
import pandas as pd
import numpy as np
import datagp as d                    # local helper module (summary, wordcloud_plot, frequency_dist)
from attention import AttentionLayer  # custom attention layer from the referenced article
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
max_text_len=30
max_summary_len=8
# to search
pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore")
def search_google():
    query = input('enter the string: ')
    chrome_path = r'D:\python_files\chromedriver.exe'
    driver = webdriver.Chrome(executable_path=chrome_path)
    links = []
    for j in search(query, tld="com", num=3, stop=3, pause=2):
        links.append(j)
    first_page = []
    result = {}
    for l in links:
        if re.search(".pdf", l) is None:
            try:
                driver.get(l)
                elm = driver.find_elements_by_tag_name('p')
                first_page = ''.join([i.text for i in elm])
                if first_page != '':
                    summ, join_sent, sen_list = d.summary(first_page)
                    w_cloud = d.wordcloud_plot(d.frequency_dist(sen_list))
                    case = {'Summary': summ, 'Text': join_sent}
                    result[l] = case
            except Exception:
                continue
    r_list = list(result.values())
    df = pd.DataFrame(result)
    df = df.transpose()
    short_text = ''.join([x['Text'] for x in r_list])
    short_summary = ''.join([x['Summary'] for x in r_list])
    df.to_csv('d:\\result.csv')
    return short_text, short_summary, df
short_text,short_summary,data=search_google()
cleaned_text = []
for t in data['Text']:
    cleaned_text.append(t)
cleaned_summary = []
for t in data['Summary']:
    cleaned_summary.append(t)
data.replace('', np.nan, inplace=True)
data.dropna(axis=0, inplace=True)
data['cleaned_text'] = cleaned_text
data['cleaned_summary'] = cleaned_summary
cleaned_text = np.array(data['cleaned_text'])
cleaned_summary = np.array(data['cleaned_summary'])
short_text = []
short_summary = []
for i in range(len(cleaned_text)):
    short_text.append(cleaned_text[i])
    short_summary.append(cleaned_summary[i])
# fraction of summaries that fit within 8 words
cnt = 0
for i in data['cleaned_summary']:
    if len(i.split()) <= 8:
        cnt = cnt + 1
print(cnt / len(data['cleaned_summary']))
df = pd.DataFrame({'text': short_text, 'summary': short_summary})
# add start and end tokens to every target summary
df['summary'] = df['summary'].apply(lambda x: 'sostok ' + x + ' eostok')
from sklearn.model_selection import train_test_split

x_tr, x_val, y_tr, y_val = train_test_split(
    np.array(df['text']), np.array(df['summary']),
    test_size=0.1, random_state=0, shuffle=True)

# prepare a tokenizer for the source texts on training data
x_tokenizer = Tokenizer()
x_tokenizer.fit_on_texts(list(x_tr))

# count rare words (frequency below the threshold)
thresh = 4
cnt = 0
tot_cnt = 0
freq = 0
tot_freq = 0
for key, value in x_tokenizer.word_counts.items():
    tot_cnt = tot_cnt + 1
    tot_freq = tot_freq + value
    if value < thresh:
        cnt = cnt + 1
        freq = freq + value

# refit the tokenizer, keeping only the common words
x_tokenizer = Tokenizer(num_words=tot_cnt - cnt)
x_tokenizer.fit_on_texts(list(x_tr))

# convert text sequences into integer sequences
x_tr_seq = x_tokenizer.texts_to_sequences(x_tr)
x_val_seq = x_tokenizer.texts_to_sequences(x_val)

# pad with zeros up to the maximum text length
x_tr = pad_sequences(x_tr_seq, maxlen=max_text_len, padding='post')
x_val = pad_sequences(x_val_seq, maxlen=max_text_len, padding='post')

# size of vocabulary (+1 for the padding token)
x_voc = x_tokenizer.num_words + 1
# prepare a tokenizer for the summaries on training data
y_tokenizer = Tokenizer()
y_tokenizer.fit_on_texts(list(y_tr))

# count rare words (frequency below the threshold)
thresh = 6
cnt = 0
tot_cnt = 0
freq = 0
tot_freq = 0
for key, value in y_tokenizer.word_counts.items():
    tot_cnt = tot_cnt + 1
    tot_freq = tot_freq + value
    if value < thresh:
        cnt = cnt + 1
        freq = freq + value

# refit the tokenizer, keeping only the common words
y_tokenizer = Tokenizer(num_words=tot_cnt - cnt)
y_tokenizer.fit_on_texts(list(y_tr))

# convert summary sequences into integer sequences
y_tr_seq = y_tokenizer.texts_to_sequences(y_tr)
y_val_seq = y_tokenizer.texts_to_sequences(y_val)

# pad with zeros up to the maximum summary length
y_tr = pad_sequences(y_tr_seq, maxlen=max_summary_len, padding='post')
y_val = pad_sequences(y_val_seq, maxlen=max_summary_len, padding='post')

# size of vocabulary (+1 for the padding token)
y_voc = y_tokenizer.num_words + 1

# sanity check: 'sostok' should appear once per training summary
y_tokenizer.word_counts['sostok'], len(y_tr)
# drop pairs whose summary contains only the start and end tokens
ind = []
for i in range(len(y_tr)):
    cnt = 0
    for j in y_tr[i]:
        if j != 0:
            cnt = cnt + 1
    if cnt == 2:
        ind.append(i)
y_tr = np.delete(y_tr, ind, axis=0)
x_tr = np.delete(x_tr, ind, axis=0)

ind = []
for i in range(len(y_val)):
    cnt = 0
    for j in y_val[i]:
        if j != 0:
            cnt = cnt + 1
    if cnt == 2:
        ind.append(i)
y_val = np.delete(y_val, ind, axis=0)
x_val = np.delete(x_val, ind, axis=0)
from tensorflow.keras import backend as K
K.clear_session()
latent_dim = 300
embedding_dim=100
# Encoder
encoder_inputs = Input(shape=(max_text_len,))
#embedding layer
enc_emb = Embedding(x_voc, embedding_dim,trainable=True)(encoder_inputs)
#encoder lstm 1
encoder_lstm1 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)
#encoder lstm 2
encoder_lstm2 = LSTM(latent_dim,return_sequences=True,return_state=True,dropout=0.4,recurrent_dropout=0.4)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)
#encoder lstm 3
encoder_lstm3=LSTM(latent_dim, return_state=True, return_sequences=True,dropout=0.4,recurrent_dropout=0.4)
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2)
# Set up the decoder, using the encoder's final states (state_h, state_c) as initial state.
decoder_inputs = Input(shape=(None,))
#embedding layer
dec_emb_layer = Embedding(y_voc, embedding_dim,trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True,dropout=0.4,recurrent_dropout=0.2)
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb,initial_state=[state_h, state_c])
# Attention layer
attn_layer = AttentionLayer(name='attention_layer')
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])
# Concat attention input and decoder LSTM output
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_out])
#dense layer
decoder_dense = TimeDistributed(Dense(y_voc, activation='softmax'))
decoder_outputs = decoder_dense(decoder_concat_input)
# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.summary()
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=2)
history = model.fit(
    [x_tr, y_tr[:, :-1]],
    y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)[:, 1:],
    epochs=50, callbacks=[es], batch_size=128,
    validation_data=([x_val, y_val[:, :-1]],
                     y_val.reshape(y_val.shape[0], y_val.shape[1], 1)[:, 1:]))
reverse_target_word_index=y_tokenizer.index_word
reverse_source_word_index=x_tokenizer.index_word
target_word_index=y_tokenizer.word_index
# Encode the input sequence to get the feature vector
encoder_model = Model(inputs=encoder_inputs,outputs=[encoder_outputs, state_h, state_c])
# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_hidden_state_input = Input(shape=(max_text_len,latent_dim))
# Get the embeddings of the decoder sequence
dec_emb2= dec_emb_layer(decoder_inputs)
# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])
#attention inference
attn_out_inf, attn_states_inf = attn_layer([decoder_hidden_state_input, decoder_outputs2])
decoder_inf_concat = Concatenate(axis=-1, name='concat')([decoder_outputs2, attn_out_inf])
# A dense softmax layer to generate prob dist. over the target vocabulary
decoder_outputs2 = decoder_dense(decoder_inf_concat)
# Final decoder model
decoder_model = Model(
[decoder_inputs] + [decoder_hidden_state_input,decoder_state_input_h, decoder_state_input_c],
[decoder_outputs2] + [state_h2, state_c2])
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    e_out, e_h, e_c = encoder_model.predict(input_seq)

    # Generate an empty target sequence of length 1.
    target_seq = np.zeros((1, 1))

    # Populate the first word of the target sequence with the start token.
    target_seq[0, 0] = target_word_index['sostok']

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + [e_out, e_h, e_c])

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = reverse_target_word_index[sampled_token_index]

        if sampled_token != 'eostok':
            decoded_sentence += ' ' + sampled_token

        # Exit condition: either hit max length or find the stop token.
        if sampled_token == 'eostok' or len(decoded_sentence.split()) >= (max_summary_len - 1):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # Update internal states
        e_h, e_c = h, c

    return decoded_sentence
def seq2summary(input_seq):
    newString = ''
    for i in input_seq:
        if i != 0 and i != target_word_index['sostok'] and i != target_word_index['eostok']:
            newString = newString + reverse_target_word_index[i] + ' '
    return newString

def seq2text(input_seq):
    newString = ''
    for i in input_seq:
        if i != 0:
            newString = newString + reverse_source_word_index[i] + ' '
    return newString
for i in range(0, 3):
    print("Review:", seq2text(x_tr[i]))
    print("Original summary:", seq2summary(y_tr[i]))
    print("Predicted summary:", decode_sequence(x_tr[i].reshape(1, max_text_len)))
    print("\n")
Solution
Tokenize short_summary: tokenize each entry of short_summary and find the longest one; that gives you max_length. Example code snippet:
import tensorflow_datasets as tfds

tokenizer = tfds.features.text.Tokenizer()

vocabulary = set()
for text_tokens in short_summary:
    some_tokens = tokenizer.tokenize(text_tokens)
    vocabulary.update(some_tokens)
Then find the longest sentence:
import numpy as np

def find_max_list_idx(lists):
    list_len = [len(i) for i in lists]
    return np.argmax(np.array(list_len))
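To tie this back to the question: instead of hard-coding max_text_len=30 and max_summary_len=8, you can derive both values from the tokenized training sequences and pass them to pad_sequences. Below is a minimal sketch, assuming the x_tr_seq, x_val_seq, y_tr_seq and y_val_seq lists produced by texts_to_sequences in your code; the max_len helper and its percentile cap are hypothetical additions, used only to keep one very long page from inflating the padding.

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def max_len(seqs, cap_percentile=None):
    """Length of the longest sequence, optionally capped at a percentile."""
    lengths = [len(s) for s in seqs]
    if cap_percentile is not None:
        return int(np.percentile(lengths, cap_percentile))
    return max(lengths)

# derive the padding lengths from the data instead of guessing them
max_text_len = max_len(x_tr_seq, cap_percentile=95)  # cap to ignore extreme outliers
max_summary_len = max_len(y_tr_seq)

x_tr = pad_sequences(x_tr_seq, maxlen=max_text_len, padding='post')
x_val = pad_sequences(x_val_seq, maxlen=max_text_len, padding='post')
y_tr = pad_sequences(y_tr_seq, maxlen=max_summary_len, padding='post')
y_val = pad_sequences(y_val_seq, maxlen=max_summary_len, padding='post')

The same max_text_len must then be used for the encoder's Input(shape=(max_text_len,)) and for decoder_hidden_state_input in the inference model, otherwise the dimension mismatch reappears.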