python - 如何提高 lstm 训练的准确性
问题描述
我用 LSTM 训练了 quora 问题对检测,但训练精度非常低,并且在我训练时总是会发生变化。我不明白我犯了什么错误。
我尝试改变损失和优化器并增加时代。
import numpy as np
from numpy import array
from keras.callbacks import ModelCheckpoint
import keras
from keras.optimizers import SGD
import tensorflow as tf
from sklearn import preprocessing
import xgboost as xgb
from keras import backend as K
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from keras.preprocessing.text import Tokenizer , text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.models import Sequential, model_from_json, load_model
from keras.layers import LSTM, Dense, Input, concatenate, Concatenate, Activation, Flatten
from keras.models import Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import nltk
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import pickle
df = pd.read_csv("questions.csv")
df.drop(['id','qid1', 'qid2'], axis=1, inplace=True)
df2 = pd.read_csv("testmenew.csv")
## 过滤数据集
SPECIAL_TOKENS = {
'quoted': 'quoted_item',
'non-ascii': 'non_ascii_word',
'undefined': 'something'
}
def clean(text, stem_words=True):
import re
from string import punctuation
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
def pad_str(s):
return ' '+s+' '
if pd.isnull(text):
return ''
if type(text) != str or text=='':
return ''
text = re.sub("\'s", " ", text)
text = re.sub(" whats ", " what is ", text, flags=re.IGNORECASE)
text = re.sub("\'ve", " have ", text)
text = re.sub("can't", "can not", text)
text = re.sub("n't", " not ", text)
text = re.sub("i'm", "i am", text, flags=re.IGNORECASE)
text = re.sub("\'re", " are ", text)
text = re.sub("\'d", " would ", text)
text = re.sub("\'ll", " will ", text)
text = re.sub("e\.g\.", " eg ", text, flags=re.IGNORECASE)
text = re.sub("b\.g\.", " bg ", text, flags=re.IGNORECASE)
text = re.sub("(\d+)(kK)", " \g<1>000 ", text)
text = re.sub("e-mail", " email ", text, flags=re.IGNORECASE)
text = re.sub("(the[\s]+|The[\s]+)?U\.S\.A\.", " America ", text, flags=re.IGNORECASE)
text = re.sub("(the[\s]+|The[\s]+)?United State(s)?", " America ", text, flags=re.IGNORECASE)
text = re.sub("\(s\)", " ", text, flags=re.IGNORECASE)
text = re.sub("[c-fC-F]\:\/", " disk ", text)
text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)
text = re.sub('\$', " dollar ", text)
text = re.sub('\%', " percent ", text)
text = re.sub('\&', " and ", text)
text = re.sub('[^\x00-\x7F]+', pad_str(SPECIAL_TOKENS['non-ascii']), text)
text = re.sub("(?<=[0-9])rs ", " rs ", text, flags=re.IGNORECASE)
text = re.sub(" rs(?=[0-9])", " rs ", text, flags=re.IGNORECASE)
text = re.sub(r" (the[\s]+|The[\s]+)?US(A)? ", " America ", text)
text = re.sub(r" UK ", " England ", text, flags=re.IGNORECASE)
text = re.sub(r" india ", " India ", text)
text = re.sub(r" switzerland ", " Switzerland ", text)
text = re.sub(r" china ", " China ", text)
text = re.sub(r" chinese ", " Chinese ", text)
text = re.sub(r" imrovement ", " improvement ", text, flags=re.IGNORECASE)
text = re.sub(r" intially ", " initially ", text, flags=re.IGNORECASE)
text = re.sub(r" quora ", " Quora ", text, flags=re.IGNORECASE)
text = re.sub(r" dms ", " direct messages ", text, flags=re.IGNORECASE)
text = re.sub(r" demonitization ", " demonetization ", text, flags=re.IGNORECASE)
text = re.sub(r" actived ", " active ", text, flags=re.IGNORECASE)
text = re.sub(r" kms ", " kilometers ", text, flags=re.IGNORECASE)
text = re.sub(r" cs ", " computer science ", text, flags=re.IGNORECASE)
text = re.sub(r" upvote", " up vote", text, flags=re.IGNORECASE)
text = re.sub(r" iPhone ", " phone ", text, flags=re.IGNORECASE)
text = re.sub(r" \0rs ", " rs ", text, flags=re.IGNORECASE)
text = re.sub(r" calender ", " calendar ", text, flags=re.IGNORECASE)
text = re.sub(r" ios ", " operating system ", text, flags=re.IGNORECASE)
text = re.sub(r" gps ", " GPS ", text, flags=re.IGNORECASE)
text = re.sub(r" gst ", " GST ", text, flags=re.IGNORECASE)
text = re.sub(r" programing ", " programming ", text, flags=re.IGNORECASE)
text = re.sub(r" bestfriend ", " best friend ", text, flags=re.IGNORECASE)
text = re.sub(r" dna ", " DNA ", text, flags=re.IGNORECASE)
text = re.sub(r" III ", " 3 ", text)
text = re.sub(r" banglore ", " Banglore ", text, flags=re.IGNORECASE)
text = re.sub(r" J K ", " JK ", text, flags=re.IGNORECASE)
text = re.sub(r" J\.K\. ", " JK ", text, flags=re.IGNORECASE)
text = re.sub('[0-9]+\.[0-9]+', " 87 ", text)
text = ''.join([c for c in text if c not in punctuation]).lower()
return text
text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)
df['question1'] = df['question1'].apply(clean)
df['question2'] = df['question2'].apply(clean)
df2['q1'] = df2['q1'].apply(clean)
df2['q2'] = df2['q2'].apply(clean)
main =df['is_duplicate'].values
main.shape
(404351,)
vocabularySize = 20000
lstm_out = 200
embed_dim = 128
Rawdata=df['question1'].apply(word_tokenize)
Rawdata2=df['question2'].apply(word_tokenize)
testme = df2['q1'].apply(word_tokenize)
testme2=df2['q2'].apply(word_tokenize)
tokenizer2 = Tokenizer(num_words = vocabularySize )
tokenizer2.fit_on_texts(testme)
tokenizer2.fit_on_texts(testme2)
tokenizer = Tokenizer(num_words = vocabularySize )
tokenizer.fit_on_texts(Rawdata)
tokenizer.fit_on_texts(Rawdata2)
sequences = tokenizer.texts_to_sequences(Rawdata)
sequences2 = tokenizer.texts_to_sequences(Rawdata2)
sequences3 = tokenizer2.texts_to_sequences(testme)
sequences4 = tokenizer2.texts_to_sequences(testme2)
data = pad_sequences(sequences, maxlen=2)
data2 = pad_sequences(sequences2, maxlen=2)
data3 = pad_sequences(sequences3, maxlen=2)
data4 = pad_sequences(sequences4, maxlen=2)
TestInput = np.array([data3,data4])
TestInput = TestInput.reshape(1,2,2)
Input = np.array([data,data2])
Input = Input.reshape(404351,2,2)
#opt = SGD(lr = 0.001, momentum = 0.60)
model = Sequential()
#model.add(Embedding(1, 4,input_length = 2 , dropout = 0.4))
model.add(LSTM((1), input_shape = (2,2), return_sequences=False))
model.add(Activation ('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adagrad', metrics=['accuracy'])
X_train,X_test,y_train,y_test = train_test_split(Input,main,test_size = 0.2,random_state = 4)
Input.shape
(404351, 2, 2)
history = model.fit(X_train,y_train,epochs = 10,validation_data= (X_test,y_test) )
model.save_weights('newoutput2.h5')
训练 323480 个样本,验证 80871 个样本 Epoch 1/10 323480/323480 [=============================] - 27s 83us/step - loss: 0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323 Epoch 2/10 323480/323480 [======================= =======] - 24s 73us/步 - 损失:0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323 Epoch 3/10 323480/323480 [============= =================] - 23s 71us/步 - 损失:0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323 Epoch 4/10 323480/323480 [=== ===========================] - 23s 71us/step - loss: 0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323 Epoch 5/10 323480/323480 [===============================] - 23s 72us/步 - 损失:0.6931 - 累积: 0.6304 - val_loss:0.6931 - val_acc:0.6323 Epoch 6/10 323480/323480 [=============================] - 23s 71us/步 - 损耗:0.6931 - acc:0。6304 - val_loss:0.6931 - val_acc:0.6323 纪元 7/10 323480/323480 [==============================] - 23s 71us/step - loss: 0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323 Epoch 8/10 323480/323480 [====================== =======] - 25s 76us/步 - 损失:0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323 Epoch 9/10 323480/323480 [============= =================] - 25s 78us/步 - 损失:0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323 Epoch 10/10 323480/323480 [=== ===========================] - 25s 78us/step - loss: 0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323 </p>6323 Epoch 8/10 323480/323480 [==============================] - 25s 76us/步 - 损耗:0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323 纪元 9/10 323480/323480 [=============================] - 25s 78us/步 - 损失:0.6931 - acc:0.6304 - val_loss:0.6931 - val_acc:0.6323 Epoch 10/10 323480/323480 [===================== =========] - 25s 78us/step - loss: 0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323 </p>6323 Epoch 8/10 323480/323480 [==============================] - 25s 76us/步 - 损耗:0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323 纪元 9/10 323480/323480 [=============================] - 25s 78us/步 - 损失:0.6931 - acc:0.6304 - val_loss:0.6931 - val_acc:0.6323 Epoch 10/10 323480/323480 [===================== =========] - 25s 78us/step - loss: 0.6931 - acc: 0.6304 - val_loss: 0.6931 - val_acc: 0.6323 </p>
filename = 'newoutput2.h5'
model.load_weights(filename)
new = model.predict(TestInput)
if new > 0.6:
print("Duplication detected")
else:
print("No duplicate")
new
giving output around 0.6567 but not atall increasing, Please help !!
我需要提高训练的准确性
解决方案
有 4 种方法可以提高深度学习性能:
- 利用数据提高性能。
- 使用算法提高性能。
- 通过算法调整提高性能。
- 通过合奏提高性能。
使用数据提高性能:
- 获取更多数据。
- 发明更多数据。
- 重新调整您的数据。
- 转换您的数据。
- 特征选择
使用算法提高性能
- 抽查算法:也许您选择的算法不是最适合您的问题。
- 重采样方法:你必须知道你的模型有多好。您对模型性能的估计是否可靠?
通过算法调整提高性能
一些关于调整神经网络算法以便从中获得更多收益的想法。
- 诊断。
- 权重初始化。
- 学习率。
- 激活函数。
- 网络拓扑结构。
- 批次和时期。
- 正则化。
- 优化和损失。
- 早停。
通过合奏提高性能
您可能需要考虑的三个一般合奏领域:
- 组合模型。
- 合并视图。
- 堆叠。
查看以下链接了解更多信息: https ://machinelearningmastery.com/improve-deep-learning-performance/
推荐阅读
- java - java中的斐波那契字系列
- elasticsearch - 如何通过处理器限制日志记录级别?
- twitter-bootstrap-3 - Bootstrap 3 到 4 转换,嵌套行不起作用
- javascript - 如何使用 ajax 对正在显示和过滤的记录进行分页
- r - 通过匹配另一列来过滤一列
- javascript - 为什么导入不能与中间变量一起使用?
- php - Laravel 和多队列 (AWS SQS)
- java - 无法从 https://repo.maven.apache.org/maven2 传输 org.apache.maven.plugins:maven-surefire-plugin:pom:2.22.2
- python - 使用函数发布 API - Python
- python - 使用熊猫“to_csv”防止尾随零