python - 在 keras 中使用神经网络进行文本分类 - 模型很弱
问题描述
我正在尝试对圣经中的经文进行分类,问题是我的模型不好,我找不到改进它的方法。
这是我的代码:
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import MaxPooling2D,Conv2D
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import SpatialDropout1D
from sklearn.model_selection import train_test_split
from tensorflow.keras import regularizers
import pandas as pd
import numpy as np
data = pd.read_csv("bible_data_set (with count and testament).csv")
data
import nltk
from nltk.stem import PorterStemmer
ps = PorterStemmer()
vocabulary_size = 0
word2location = {}
def prepare_vocabulary(data):
index = 0
for sentance in data['text']:
#sentance = sentance.lower()
words = nltk.word_tokenize(sentance)
for word in words:
stemed_word = ps.stem(word)
if stemed_word not in word2location:
word2location[stemed_word] = index
index += 1
return index
def convert2vec(sentance):
#sentance = sentance.lower()
res_vec = np.zeros(vocabulary_size)
words = nltk.word_tokenize(sentance)
for word in words:
stemed_word = ps.stem(word)
if stemed_word in word2location:
res_vec[word2location[stemed_word]]+=1
return res_vec
books = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy', 'Joshua', 'Judges',
'Ruth', '1 Samuel', '2 Samuel', '1 Kings', '2 Kings', '1 Chronicles', '2 Chronicles',
'Ezra', 'Nehemiah', 'Esther', 'Job', 'Psalms', 'Proverbs', 'Ecclesiastes',
'Song of Solomon', 'Isaiah', 'Jeremiah', 'Lamentations', 'Ezekiel', 'Daniel',
'Hosea', 'Joel', 'Amos', 'Obadiah', 'Jonah', 'Micah', 'Nahum', 'Habakkuk',
'Zephaniah', 'Haggai', 'Zechariah', 'Malachi', 'Matthew', 'Mark', 'Luke', 'John', 'Acts', 'Romans', '1 Corinthians',
'2 Corinthians', 'Galatians', 'Ephesians', 'Philippians', 'Colossians',
'1 Thessalonians', '2 Thessalonians', '1 Timothy', '2 Timothy', 'Titus', 'Philemon',
'Hebrews', 'James', '1 Peter', '2 Peter', '1 John', '2 John', '3 John', 'Jude',
'Revelation']
def encode(line):
res_vec = np.zeros(66)
idx = books.index(data.iloc[line]['book'])
res_vec[idx] = 1
return res_vec
vocabulary_size = prepare_vocabulary(data)
print("the size of the vocabulary is: ", vocabulary_size)
word2location
import random
rand = []
for r in range (4500):
ra = random.randrange(0, 31101)
if(ra not in rand):
rand.append(ra)
train_x = []
train_y = []
test_x = []
test_y = []
for i in range(len(data['text'])):
if(i not in rand):
train_x.append(i)
train_y.append(i)
elif(i in rand):
test_x.append(i)
test_y.append(i)
data_x = np.array([convert2vec(data.iloc[i]['text']) for i in train_x])
np.random.shuffle(data_x)
data_y = np.array([encode(i) for i in train_y])
np.random.shuffle(data_y)
test_data_x = np.array([convert2vec(data.iloc[i]['text']) for i in test_x])
np.random.shuffle(test_data_x)
test_data_y = np.array([encode(i) for i in test_y])
np.random.shuffle(test_data_y)
model = Sequential()
model.add(Dense(128, activation = 'sigmoid', input_dim = vocabulary_size))
model.add(Dropout(0.1))
model.add(Dense(128, activation = 'sigmoid'))
model.add(Dropout(0.1))
model.add(Dense(66, activation = 'softmax'))
opt = SGD(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
history = model.fit(data_x, data_y, epochs=50, batch_size=16,validation_data=(test_data_x,test_data_y),callbacks=[EarlyStopping(monitor='val_loss', patience=5, min_delta=0.00001)])
我一直在过度拟合或拟合不足。我已经尝试对密集进行 relu 激活,并更改了损失函数和优化器,但没有任何帮助。有什么我想念的吗?
解决方案
这里
data_x = np.array([convert2vec(data.iloc[i]['text']) for i in train_x])
np.random.shuffle(data_x)
data_y = np.array([encode(i) for i in train_y])
np.random.shuffle(data_y)
test_data_x = np.array([convert2vec(data.iloc[i]['text']) for i in test_x])
np.random.shuffle(test_data_x)
test_data_y = np.array([encode(i) for i in test_y])
np.random.shuffle(test_data_y)
您为火车数据 (data_x) 调用了 np.random.shuffle,为火车标签 (data_y) 调用了 np.random.shuffle。这不应该是正确的,因为您的功能应该与您的标签保持配对。只需将它们配对并随机洗牌一次,然后进行相同的测试即可。
推荐阅读
- javascript - IE 上的命名参数
- r - Travis CI 问题 - 无法检索 gpg 密钥
- angular - 如何降级 Angular 项目?
- android - 我收到错误,因为图片显示我找不到任何使用 react native 的解决方案 react native
- json - jq在powershell下用-提取密钥
- jira - Jira 是否使用基于令牌的身份验证?
- c++ - 使用默认参数定义友元函数的原因
- c# - 发布图像时“等待 httpClient.PostAsync”上的对象处置异常
- c# - Asp.net 下拉列表 - Listitem - 资源文件 - 值
- apache-camel - 使用bindy将csv转换为xml