python - 即使我将 max_features 设置为 500 并将数据转换为 float32,仍然报错 "nnz of the result is too large"
问题描述
我想找出电视剧之间的相似之处
from sklearn.feature_extraction.text import TfidfVectorizer
#define a tfidf vectorizer object and remove all english stop words such as 'the','a'
tfidf=TfidfVectorizer(stop_words='english', max_features=500)
#construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix=tfidf.fit_transform(merged_data['name'])
# output the shape of the tfidf_matrix
tfidf_matrix.shape
tfidf_matrix 的形状是 (6472256, 500)
from sklearn.metrics.pairwise import cosine_similarity,linear_kernel,sigmoid_kernel
# NOTE: linear_kernel is faster than cosine_similarity, but the call below still uses cosine_similarity
#computing cosine similarity matrix
cosine_sim=cosine_similarity(tfidf_matrix,tfidf_matrix)
from sparse_dot_mkl import dot_product_mkl
result = dot_product_mkl(random_tc, random_p)
但我收到错误消息
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-78-1e4fbfd6e689> in <module>
5 #computing cosine similarity matrix
6
----> 7 cosine_sim=cosine_similarity(tfidf_matrix,tfidf_matrix)
8 from sparse_dot_mkl import dot_product_mkl
9
/opt/conda/lib/python3.7/site-packages/sklearn/metrics/pairwise.py in cosine_similarity(X, Y, dense_output)
1187
1188 K = safe_sparse_dot(X_normalized, Y_normalized.T,
-> 1189 dense_output=dense_output)
1190
1191 return K
/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
/opt/conda/lib/python3.7/site-packages/sklearn/utils/extmath.py in safe_sparse_dot(a, b, dense_output)
150 ret = np.dot(a, b)
151 else:
--> 152 ret = a @ b
153
154 if (sparse.issparse(a) and sparse.issparse(b)
/opt/conda/lib/python3.7/site-packages/scipy/sparse/base.py in __matmul__(self, other)
558 raise ValueError("Scalar operands are not allowed, "
559 "use '*' instead")
--> 560 return self.__mul__(other)
561
562 def __rmatmul__(self, other):
/opt/conda/lib/python3.7/site-packages/scipy/sparse/base.py in __mul__(self, other)
478 if self.shape[1] != other.shape[0]:
479 raise ValueError('dimension mismatch')
--> 480 return self._mul_sparse_matrix(other)
481
482 # If it's a list or whatever, treat it like a matrix
/opt/conda/lib/python3.7/site-packages/scipy/sparse/compressed.py in _mul_sparse_matrix(self, other)
507 np.asarray(self.indices, dtype=idx_dtype),
508 np.asarray(other.indptr, dtype=idx_dtype),
--> 509 np.asarray(other.indices, dtype=idx_dtype))
510
511 idx_dtype = get_index_dtype((self.indptr, self.indices,
RuntimeError: nnz of the result is too large
因此,我尝试将数据类型从 float64 改为 float32
tfidf_matrix=tfidf_matrix.astype(np.float32)
但是我仍然收到同样的错误消息。我应该减小 tfidf_matrix 的规模吗?需要减小到多少?我目前在使用 Google Colab、Kaggle Notebook 和 Jupyter Notebook,并希望找到一个能够运行这段代码的平台,以便继续进行评分预测。
解决方案
推荐阅读
- windows - 重新访问 NASM 64 中 Windows API 中的 CreateFileA:无效参数
- r - R:通过删除和添加循环变量来迭代向量
- java - 尝试将文件从一个文件夹复制到 JAVA 中的临时文件夹时出错 [拒绝访问]?
- javascript - 如何使用异步json2xlsx
- google-chrome - 如何在 primefaces 或 html 组件上显示 pdf,且不显示允许用户下载文档的工具栏?仅查看 PDF
- wordpress - 尝试在 wp-cli 上使用 woocommerce 命令 - 获取“wc”不是已注册的 wp 命令。
- java - 我想从访问表中获取日期并检查它是否是 Eclipse 中的今天日期,但它给了我错误
- marklogic - Cloudformation 模板 - 替换集群、保留卷(和 VPC)
- permissions - 邮箱权限自动更改 whm
- r - 在 SQL 查询列上运行 While 循环(需要使用 sqldf 完成)