python - 如何使用 GloVe 生成向量矩阵?
问题描述
我正在使用 HDBSCAN 算法对我拥有的文档进行聚类。为了把单词转换成向量矩阵,我目前使用的是 tf-idf 算法,但我想改用 GloVe。我搜索过相关帖子,但仍不清楚如何使用这个算法。我还阅读了有关 Gensim 的资料,但不明白如何用它来实现 GloVe。这是我目前的做法:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
import csv
import string
import time
import sys
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.externals import joblib
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import hdbscan
# Fetch the NLTK stop-word corpus; it is read further down via
# nltk.corpus.stopwords.
nltk.download('stopwords')

# Input file and run settings. Only `filename` is used by the code shown in
# this script; the other three are presumably for a later pickling/k-means
# step -- TODO confirm.
filename = "twitter-test-dataset.csv"
pkl_file = "doc_cluster.pkl"
num_clusters = 10
generate_pkl = False

# Accumulators. `title` and `synopses` are filled while the CSV file is read
# (raw tweet text and its cleaned form); `csvRows` is not used by the code
# shown here.
csvRows = []
title, synopses = [], []
def _clean_tweet(text):
    """Normalize one raw tweet into the plain text used for clustering.

    The substitutions run in this order, and the order matters because
    later patterns see the output of earlier ones:

    1. drop the characters ``. , " !``
    2. drop an ``RT`` marker (plus following whitespace) at a line start
    3. drop everything from an http(s) link to the end of that line
    4. drop colons
    5. replace @mentions, and any character that is not an ASCII letter,
       digit, space or tab, with a single space
    6. keep only characters found in ``string.printable``

    Returns the cleaned text as a real string.
    """
    text = re.sub(r'[.,"!]+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^RT[\s]+', '', text, flags=re.MULTILINE)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'[:]+', '', text, flags=re.MULTILINE)
    # Raw string: same pattern as before, but without relying on Python
    # passing the unknown escapes \w and \S through unchanged.
    text = re.sub(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text,
        flags=re.MULTILINE)
    # ''.join(...) instead of filter(): on Python 3 filter() returns a lazy
    # iterator, which breaks both the duplicate check and the tokenizers.
    return ''.join(ch for ch in text if ch in string.printable)


# pre-process data: read the tweets from the third CSV column, clean them and
# keep each distinct cleaned text once (with its raw text in `title`).
duplicates = 0
# Set of cleaned texts already stored, so the duplicate test is O(1) instead
# of a scan over the whole `synopses` list for every row.
_seen_synopses = set(synopses)
with open(filename, 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    # The first row holds the field names. next(reader, None) works on both
    # Python 2 and 3 and does not raise on an empty file.
    fields = next(csvreader, None)
    for row in csvreader:
        # Blank lines come back as [] and short rows have no third column;
        # skip them instead of failing with IndexError.
        if len(row) < 3:
            continue
        line = _clean_tweet(row[2])
        if line in _seen_synopses:
            duplicates += 1
            continue
        _seen_synopses.add(line)
        synopses.append(line)
        title.append(row[2])
print("Removed " + str(duplicates) + " rows")
# English Snowball stemmer used by tokenize_and_stem().
stemmer = SnowballStemmer("english")
# English stop-word list from the NLTK corpus. Note that the vectorizer
# further down is configured with stop_words='english' and does not read
# this variable.
stopwords = nltk.corpus.stopwords.words('english')
def tokenize_and_stem(text):
    """Return the stems of every letter-bearing token in *text*.

    The text is split into sentences first and each sentence into words, so
    punctuation ends up as separate tokens. Tokens without at least one
    ASCII letter (numbers, bare punctuation) are dropped, and the remaining
    ones are passed through the module-level ``stemmer``.
    """
    words = (
        word
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    return [stemmer.stem(word) for word in words if re.search('[a-zA-Z]', word)]
def tokenize_only(text):
    """Return the lower-cased, letter-bearing tokens of *text* (no stemming).

    Sentences are tokenized first, then words within each sentence, so that
    punctuation becomes its own token. Tokens with no ASCII letter in them
    (numbers, bare punctuation) are discarded.
    """
    kept = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            lowered = word.lower()
            if re.search('[a-zA-Z]', lowered):
                kept.append(lowered)
    return kept
# Flat vocabularies over all synopses, built side by side: the stemmed tokens
# and the lower-cased unstemmed tokens.
totalvocab_stemmed = []
totalvocab_tokenized = []
for synopsis in synopses:
    totalvocab_stemmed += tokenize_and_stem(synopsis)
    totalvocab_tokenized += tokenize_only(synopsis)

# Lookup table with the stems as index and the unstemmed tokens in the
# 'words' column.
vocab_frame = pd.DataFrame(
    {'words': totalvocab_tokenized},
    index=totalvocab_stemmed,
)
# TF-IDF vectorizer: tokens come from tokenize_and_stem(), and uni-, bi- and
# tri-grams are used as features.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize_and_stem,
    stop_words='english',
    ngram_range=(1, 3),
    max_features=200000,
    max_df=0.8,
    min_df=0.0,
    use_idf=True,
)

# Fit the vectorizer on the cleaned tweets and build the TF-IDF matrix.
tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)
terms = tfidf_vectorizer.get_feature_names()

# Cluster the TF-IDF matrix with HDBSCAN and print the resulting labels.
c = hdbscan.HDBSCAN(min_cluster_size=5)
c.fit(tfidf_matrix)
print(c.labels_)

# The script ends here.
sys.exit()
正如您在上面的实现中所看到的,我已经将 HDBSCAN 与 tf-idf 结合起来用于文本聚类。我怎样才能用 GloVe 代替 tf-idf 呢?
解决方案
通常的做法似乎是:取文档中每个单词的 GloVe 向量,再用所有这些向量的平均值来表示该文档。
我对这种做法并不信服。认为这些向量真的可以相加或取平均的理论依据似乎很薄弱,因为这样做会破坏向量之间的角度关系。
推荐阅读
- angular - 将 TypeScript 库添加到 Angular
- spring-boot - Spring Boot Security 错误:编码密码看起来不像 BCrypt,从内存认证转换为数据库认证
- python - 词袋标记每个字母而不是单词
- python - 尝试在 python 中为文本文件中的数量级制作直方图
- javascript - 如何在 vue.js 上接收 HTTP POST 参数?
- prestashop-1.7 - 在产品页面点击保存按钮后挂钩
- javascript - 将两个数组变成json数据
- dialogflow-es - 从我的 Actions on Google 链接到 Web url
- linux - 将 tee 与 bash 一起使用会导致一些脚本挂起
- apache-spark - Apache PySpark - 获取最新记录问题