tensorflow - Accuracy of a skip-gram (word2vec) model for word similarity on the Brown corpus (NLTK)
Problem description
I want to build a word-similarity matrix from the Brown corpus in the NLTK library. The problem is that the loss
tf.reduce_mean(tf.nn.sampled_softmax_loss(weights = softmax_weight, biases = softmax_bias, inputs = embed,
labels = y, num_sampled = num_sampled, num_classes = num_words))
drops from about 4.2 to 2.0 and then starts bouncing up and down. My question is: how can I improve the model's accuracy?
Here is my complete code:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding,Layer
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from numpy.random import choice
import random
from itertools import repeat
import tensorflow as tf
import nltk
import re
from nltk.corpus import stopwords
from nltk.corpus import brown
import string
nltk.download('brown')
nltk.download('stopwords')
#Dataset loading and preparation:
dataset = brown.sents()
punct = list(string.punctuation)
punct.append("``")
punct.append("''")
punct.append("--")
stops = set(stopwords.words("english"))
dataset = [[word.lower() for word in sentence if word not in punct and word.lower() not in stops] for sentence in dataset]
#tokenization
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(dataset)
word2index = tokenizer.word_index
index_word = tokenizer.index_word
total_words = 5000
data_prep = tokenizer.texts_to_sequences(dataset)
data_prep = [sentence for sentence in data_prep if len(sentence) >2]
#word2vec
def word2vec_preparation(data, window_size, num_skips):
    grams = []
    context = []
    target = []
    assert window_size >= 1, 'window_size argument is < 1!'
    assert num_skips >= 1, 'num_skips argument < 1!'
    for sentence in data:
        if len(sentence) - window_size > 1:
            #print(sentence)
            for i in range(len(sentence)):
                if i - window_size < 0:
                    gram = sentence[i+1:i+window_size + 1]
                    check = num_skips - len(set(gram))
                    #print(gram)
                    grams.append(gram)
                    if check > 0:
                        context.extend(random.sample(set(gram), len(set(gram))))
                        target.extend(repeat(sentence[i], len(set(gram))))
                    else:
                        context.extend(random.sample(set(gram), num_skips))
                        target.extend(repeat(sentence[i], num_skips))
                elif i + window_size > len(sentence) - 1:
                    gram = sentence[i-window_size:i]
                    check = num_skips - len(set(gram))
                    #print(gram)
                    grams.append(gram)
                    if check > 0:
                        context.extend(random.sample(set(gram), len(set(gram))))
                        target.extend(repeat(sentence[i], len(set(gram))))
                    else:
                        context.extend(random.sample(set(gram), num_skips))
                        target.extend(repeat(sentence[i], num_skips))
                else:
                    gram = sentence[i-window_size:i] + sentence[i+1:i+window_size + 1]
                    check = num_skips - len(set(gram))
                    #print(gram)
                    grams.append(gram)
                    if check > 0:
                        context.extend(random.sample(set(gram), len(set(gram))))
                        target.extend(repeat(sentence[i], len(set(gram))))
                    else:
                        context.extend(random.sample(set(gram), num_skips))
                        target.extend(repeat(sentence[i], num_skips))
        #print('----------------------')
    return grams, context, target
grams,context,target = word2vec_preparation(data_prep,window_size = 2,num_skips = 3)
target = np.array(target,dtype= np.int64)
context = np.array(context,dtype= np.int64)
context = context.reshape(len(context),1)
dataset_train = tf.data.Dataset.from_tensor_slices((target, context))
dataset_train = dataset_train.shuffle(buffer_size=1024).batch(64)
#Parameters:
num_words = 5000
embed_size = 300
num_sampled = 64
initializer_softmax = tf.keras.initializers.GlorotUniform()
#Variables:
embeddings_weight = tf.Variable(tf.random.uniform([num_words,embed_size],-1.0,1.0))
softmax_weight = tf.Variable(initializer_softmax([num_words,embed_size]))
softmax_bias = tf.Variable(initializer_softmax([num_words]))
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
@tf.function
def training(X, y):
    with tf.GradientTape() as tape:
        embed = tf.nn.embedding_lookup(embeddings_weight, X)  # embeddings_weight is the embedding table; X is a batch of word indices to look up
        loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights=softmax_weight, biases=softmax_bias, inputs=embed,
                                                         labels=y, num_sampled=num_sampled, num_classes=num_words))
    variables = [embeddings_weight, softmax_weight, softmax_bias]
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return loss
    #tf.print('Loss:', loss)

EPOCHS = 100
for epoch in range(EPOCHS):
    for step, (X, y) in enumerate(dataset_train):
        loss = training(X, y)
    tf.print('Epoch:', epoch + 1, 'loss:', loss)
Solution
The reported loss is not the gold standard of a model's usefulness — actually testing the resulting word vectors on their intended use is.
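For instance, a rough sanity check is to look at the nearest neighbours of a few familiar words and see whether they make semantic sense. A minimal sketch, reusing embeddings_weight, word2index and index_word from the question's code; the probe words below are arbitrary examples and are assumed to fall inside the 5000-word vocabulary:

    import numpy as np

    def nearest_neighbors(word, k=5):
        vectors = embeddings_weight.numpy()
        # unit-length rows, so a dot product equals cosine similarity
        vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
        idx = word2index[word]
        sims = vectors @ vectors[idx]
        # best matches, excluding the probe word itself and the unused padding row 0
        best = [i for i in np.argsort(-sims) if i not in (0, idx)][:k]
        return [(index_word[int(i)], float(sims[i])) for i in best]

    for probe in ['money', 'war', 'children']:
        print(probe, nearest_neighbors(probe))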
Moreover, a plateauing loss (which then jitters up and down) is the natural and expected behavior of this sort of optimization. (The model will never be perfect at predicting the training data, unless it is oversized for the data — in which case it would be "overfit" to the training data and likely perform worse on real tasks.) You want the loss to be as low as it can get for your particular algorithm and model parameters — not 0.0.
There may be other errors in your code that I haven't looked for. I would suggest using an off-the-shelf, already-debugged Word2Vec implementation — either outright, or, if your own implementation really matters (perhaps for learning purposes), as a baseline for judging whether your code is working.
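For example, gensim's Word2Vec can be trained on the same pre-processed Brown sentences in a few lines and used as a reference point. A sketch assuming gensim 4.x is installed; the hyperparameters below are illustrative choices that roughly mirror the question's settings, not tuned values:

    from gensim.models import Word2Vec

    # `dataset` is the same list of cleaned, lower-cased Brown sentences built above
    baseline = Word2Vec(
        sentences=dataset,
        vector_size=300,   # same embedding size as embed_size above
        window=2,          # same context window as window_size above
        sg=1,              # skip-gram
        negative=15,
        min_count=5,
        epochs=20,
    )

    # Compare these neighbours with the ones produced by the custom model
    print(baseline.wv.most_similar('money', topn=5))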