Accuracy of a skip-gram (word2vec) model for word similarity using the Brown corpus (NLTK)

Problem description

I want to build a word-similarity matrix from the Brown corpus in the NLTK library. The problem is that the loss

tf.reduce_mean(tf.nn.sampled_softmax_loss(weights = softmax_weight, biases = softmax_bias, inputs = embed,
                  labels = y, num_sampled = num_sampled, num_classes = num_words))

drops from 4.2 to 2.0 and then starts jumping up and down. The question is: how can I improve the accuracy of the model?

Here is my complete code:

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
import random
from itertools import repeat
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.corpus import brown
import string
nltk.download('brown')
nltk.download('stopwords')


#Dataset loading and preparation:
dataset = brown.sents()

punct = list(string.punctuation)
punct.append("``")
punct.append("''")
punct.append("--")
stops = set(stopwords.words("english")) 

dataset = [[word.lower() for word in sentence if word not in punct and word.lower() not in stops] for sentence in dataset] 


#tokenization
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(dataset)

word2index = tokenizer.word_index
index_word = tokenizer.index_word

total_words = 5000

data_prep = tokenizer.texts_to_sequences(dataset) 
data_prep = [sentence for sentence in data_prep if len(sentence) >2] 

#word2vec
def word2vec_preparation(data, window_size, num_skips):
    grams = []
    context = []
    target = []

    assert window_size >= 1, 'window_size argument is < 1!'
    assert num_skips >= 1, 'num_skips argument is < 1!'
    for sentence in data:
        if len(sentence) - window_size > 1:
            for i in range(len(sentence)):
                #build the context window around the centre word at position i
                if i - window_size < 0:                    #start of sentence: right context only
                    gram = sentence[i+1:i+window_size + 1]
                elif i + window_size > len(sentence) - 1:  #end of sentence: left context only
                    gram = sentence[i-window_size:i]
                else:                                      #middle: context on both sides
                    gram = sentence[i-window_size:i] + sentence[i+1:i+window_size + 1]
                grams.append(gram)

                #draw up to num_skips distinct context words for this centre word
                #(random.sample needs a sequence, not a set, on Python 3.11+)
                unique = list(set(gram))
                k = min(num_skips, len(unique))
                context.extend(random.sample(unique, k))
                target.extend(repeat(sentence[i], k))

    return grams, context, target

grams,context,target = word2vec_preparation(data_prep,window_size = 2,num_skips = 3)

target = np.array(target,dtype= np.int64)
context = np.array(context,dtype= np.int64)


context = context.reshape(len(context), 1)  #sampled_softmax_loss expects labels of shape [batch_size, num_true]
dataset_train = tf.data.Dataset.from_tensor_slices((target, context))
dataset_train = dataset_train.shuffle(buffer_size=1024).batch(64)

#Parameters:
num_words = 5000
embed_size = 300
num_sampled = 64
initializer_softmax = tf.keras.initializers.GlorotUniform()
#Variables:
embeddings_weight = tf.Variable(tf.random.uniform([num_words,embed_size],-1.0,1.0))
softmax_weight = tf.Variable(initializer_softmax([num_words,embed_size]))
softmax_bias = tf.Variable(initializer_softmax([num_words]))

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

@tf.function
def training(X, y):
  with tf.GradientTape() as tape:
    #embeddings_weight is the embedding table; X is a batch of centre-word indices to look up
    embed = tf.nn.embedding_lookup(embeddings_weight, X)
    loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(weights = softmax_weight, biases = softmax_bias, inputs = embed,
                  labels = y, num_sampled = num_sampled, num_classes = num_words))
  variables = [embeddings_weight, softmax_weight, softmax_bias]
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))
  return loss



EPOCHS = 100

for epoch in range(EPOCHS):
  for step, (X,y) in enumerate(dataset_train):
    loss = training(X,y)
  tf.print('Epoch:',epoch + 1, 'loss:',loss)

Tags: tensorflow, deep-learning, word2vec

Solution


The reported loss is not the gold standard of a model's usefulness; the real test is how the resulting word vectors perform on your actual end task.
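
For example, a quick sanity check is to look at the nearest neighbours of a few probe words under cosine similarity. Here is a minimal sketch, assuming the embeddings_weight, word2index and index_word variables from the question's code are in scope; the probe word 'money' is only an illustrative choice and has to be inside the top-5000 vocabulary:

import numpy as np

def nearest_neighbours(query, k=10):
    #cosine similarity of every row of the embedding table against the query word's vector
    vectors = embeddings_weight.numpy()  #shape (num_words, embed_size)
    unit = vectors / np.maximum(np.linalg.norm(vectors, axis=1, keepdims=True), 1e-12)
    sims = unit @ unit[word2index[query]]
    best = np.argsort(-sims)[1:k + 1]    #drop the query word itself
    return [(index_word[i], float(sims[i])) for i in best if i in index_word]

print(nearest_neighbours('money'))

If the neighbours returned for common probe words are obviously unrelated, that tells you more about the model than the exact value of the loss.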

Also, reaching a loss plateau (and then jittering up and down) is the natural, expected behaviour of this kind of optimisation. (The model can never become perfect at predicting the training data unless it is oversized for that data, in which case it will "overfit" the training data and likely perform worse on real tasks.) You want each model's loss to be as low as it can be, given the particular algorithm and model parameters, not 0.0.

There may be other errors in your code that I haven't looked for. I'd suggest using an off-the-shelf, debugged Word2Vec implementation, either directly, or, if your own implementation really matters (perhaps for learning purposes), as a baseline for judging whether your code is working at all.
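
The answer doesn't name a particular library, but Gensim's Word2Vec is a common choice for such a baseline. As a sketch, a skip-gram model can be trained on the same lower-cased Brown sentences in a few lines; the hyperparameters below are illustrative, not tuned, and assume gensim >= 4.0:

import nltk
from nltk.corpus import brown
from gensim.models import Word2Vec

nltk.download('brown')

#skip-gram (sg=1) with negative sampling on the lower-cased Brown sentences
sentences = [[w.lower() for w in sent] for sent in brown.sents()]
model = Word2Vec(sentences, vector_size=300, window=2, sg=1,
                 negative=15, min_count=5, epochs=10, workers=4)

print(model.wv.most_similar('money', topn=10))

If your own vectors produce clearly worse nearest neighbours than such a baseline trained on the same data, that is a much stronger signal of a bug than the shape of the loss curve.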

