Text clustering: duplicates in the centroid list

Problem description

When looking at the centroids of the clusters produced by KMeans, I found duplicates in the centroid list.

What does that mean, and how can I get rid of the duplicates?

Could this be the reason I end up with a "black hole" cluster that holds 30-40% of the whole dataset? With roughly 40-80 clusters that is a huge share. Can you give me some hints on how to make the clusters more balanced?


import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocessing and transforming the DataFrame into a list of text strings
import re

corpus = []
for text in df["text"]:
    y = re.sub(r'[^\w\s]', ' ', text)  # drop punctuation (keep word characters and whitespace)
    y = re.sub(r'-', ' ', y)           # hyphens to spaces
    y = re.sub(r'[a-zA-Z]', '', y)     # drop Latin letters (the corpus is Russian)
    y = re.sub(r'\n', ' ', y)          # newlines to spaces
    y = re.sub(r'\d', ' ', y)          # digits to spaces
    y = y.lower()
    corpus.append(y)
# corpus - list of preprocessed strings
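A quick sanity check of what the cleaning leaves behind, on a made-up sample string (not from the dataset):

# made-up sample, only for illustration
sample = "Проверка текста: цифры 123, (скобки) и English words\n"
y = re.sub(r'[^\w\s]', ' ', sample)
y = re.sub(r'[a-zA-Z]', '', y)
y = re.sub(r'[\n\d]', ' ', y)
print(y.lower())  # only the Cyrillic words and whitespace survive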



# PyMyStem - lemmatizer for Russian
!wget http://download.cdn.yandex.net/mystem/mystem-3.0-linux3.1-64bit.tar.gz
!tar -xvf mystem-3.0-linux3.1-64bit.tar.gz
!cp mystem /root/.local/bin/mystem

from pymystem3 import Mystem 
m = Mystem() 
corp_lemmz = []
stop_words = [LIST OF STOP WORDS]

for row in corpus:
    lemm = m.lemmatize(row)
    # filter stop words without removing items from the list while iterating over it
    lemm = [word for word in lemm if word not in stop_words]
    lemmas = "".join(lemm)
    lemmas = re.sub(r'\n', ' ', lemmas)
    corp_lemmz.append(lemmas)
#corp_lemmz - list of preprocessed strings
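For reference, Mystem.lemmatize() returns the lemmas interleaved with the original whitespace plus a trailing newline, which is why the join and the newline substitution above are needed. A quick check on a made-up sentence:

print(m.lemmatize("красивые машины ехали быстро"))
# e.g. ['красивый', ' ', 'машина', ' ', 'ехать', ' ', 'быстро', '\n']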

# Vectorizing the preprocessed strings

tfidf_vectorizer = TfidfVectorizer(use_idf=True, analyzer='word', stop_words=stop_words)
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(corp_lemmz)
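Before clustering it can help to check the size of the matrix and of the vocabulary (a small check I would add here, not part of the original code):

print(tfidf_vectorizer_vectors.shape)             # (n_documents, n_features)
print(len(tfidf_vectorizer.get_feature_names()))  # vocabulary size; entries are unique by construction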

# Trying to count duplicate terms in the vocabulary, but the counter stays at 0
counter = 0
count_list = []
for i in tfidf_vectorizer.get_feature_names():
    if not i in count_list:
        count_list.append(i)
    else:    
        counter +=1
print(counter)

# Clustering and printing the top terms per cluster

print("Top terms per cluster:")
n_clusters = 80
mbk = KMeans(n_clusters=n_clusters, init = "k-means++", random_state=20)
mbk.fit(tfidf_vectorizer_vectors)
order_centroids = mbk.cluster_centers_.argsort()[:, ::-1]
labels = mbk.labels_
terms = tfidf_vectorizer.get_feature_names()

for i in range(n_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()
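To see the "black hole" effect mentioned above in numbers, the label counts can be inspected directly (my own addition, using the labels produced by the fit):

sizes = np.bincount(labels, minlength=n_clusters)
for cluster_id in np.argsort(sizes)[::-1][:10]:
    print("Cluster %d: %d documents (%.1f%% of the corpus)"
          % (cluster_id, sizes[cluster_id], 100.0 * sizes[cluster_id] / len(labels)))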

# Counting how often the same term index appears among the clusters' top-10 terms
import collections
c = collections.Counter()
for i in range(n_clusters):
    for ind in order_centroids[i, :10]:
        c[ind] += 1  # Counter returns 0 for missing keys, so no if/else is needed

c.most_common()

Output: [(981, 27), (982, 26), (983, 22), (980, 21), (2938, 20) ...
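These counts are term indices that appear in the top-10 of many clusters, which can happen when clusters share high-weight terms and is not by itself proof that two centroids are identical. To check whether whole centroid rows are actually duplicated, one could compare the rows of cluster_centers_ directly (a sketch, not part of the original code):

centers = np.round(mbk.cluster_centers_, decimals=6)  # round to absorb floating-point noise
unique_rows = np.unique(centers, axis=0)
print("clusters:", centers.shape[0], "unique centroids:", unique_rows.shape[0])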

# Matching each unpreprocessed sentence with its cluster label
# (note: identical sentences overwrite each other here, because they are used as dict keys)
match = {}
for i in range(len(labels)):
    match[corpus[i]] = labels[i]

a = sorted(match.items(), key=lambda x: x[1])

# Creating a dict that maps each cluster label to its list of sentences
output = {}
for sentence, label in a:
    if label in output:
        output[label].append(sentence)
    else:
        output[label] = [sentence]

# Exporting the results to txt files
with open("sentence_plus_cluster_80_idf_True_Predicted_MiniBatch.txt", "w") as file:
    for i in range(n_clusters):
        if output.get(i) is not None:
            file.write("Cluster " + str(i) + '\n' + "Len of cluster: " + str(len(output.get(i))) + '\n')
            for x in output.get(i):
                file.write(x + '\n')

with open("top_words_per_clust_80_idf_True_PRedicted_MiniBatch.txt", "w") as file:
    file.write("KMeans")  # %d" % n_clusters + '\n')
    for i in range(n_clusters):
        print("Cluster %d:" % i, end='')
        file.write("Cluster %d:" % i)
        for ind in order_centroids[i, :20]:
            print(' %s' % terms[ind], end='')
            file.write(terms[ind] + '\n')
        print()

Tags: python, duplicates, cluster-analysis, k-means, centroid
