python - 连接两个数据框时出现奇怪的python数据框尺寸问题
问题描述
这是用来重现问题的代码和错误消息。我还打印了要(使用 hstack)连接的两个对象的原始内容和形状,看起来都没有问题,想知道错误出在哪里?
from sklearn.model_selection import train_test_split
import pandas as pd
from pandas import DataFrame
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
# Minimal reproduction from the question: the numeric 'Age' column is meant to
# be stacked side by side with bag-of-words counts built from the 'Tags' text.
big_X = pd.DataFrame({'Tags':['tag_a tag_b tag_c', 'tag_b tag_c', 'tag_b tag_c tag_d', 'tag_e tag_b tag_b tag_a'], 'Age':[20, 21, 19, 18]})
big_Y = pd.DataFrame({'Label':[0, 1, 0, 1]})
# Half of the four rows go to train, the other half to test (two rows each).
X_train, X_test, y_train, y_test = train_test_split(big_X, big_Y, test_size=0.5)
# Selecting a single column yields a 1-D pandas Series of shape (2,).
result_matrix_train = X_train['Age']
result_matrix_test = X_test['Age']
sparse_columns = ['Tags']
for feature_colunm_name in sparse_columns:
    print('processing feature name: ', feature_colunm_name)
    # Fit the vocabulary on the train split only.
    cv = CountVectorizer(stop_words=None)
    X_train_cv = cv.fit_transform(X_train[feature_colunm_name])
    print ('X_train_cv: ', X_train_cv)
    print ('result_matrix_train: ', result_matrix_train)
    # Merge the vector with others
    if result_matrix_train is not None:
        print (result_matrix_train)
        print (X_train_cv)
        # NOTE: this call raises the ValueError quoted in the traceback below.
        # The 1-D Series is treated as one row, so its row count (1) does not
        # match the two rows of X_train_cv. It is left unfixed on purpose,
        # because this snippet exists to reproduce the error.
        result_matrix_train = hstack((result_matrix_train, X_train_cv))
    else:
        result_matrix_train = X_train_cv
    # Now transform the test data
    # Reuse the vocabulary fitted on train so both matrices share columns.
    X_test_cv = cv.transform(X_test[feature_colunm_name])
    if result_matrix_test is not None:
        result_matrix_test = hstack((result_matrix_test, X_test_cv))
    else:
        result_matrix_test = X_test_cv
错误信息,
24 print (result_matrix_train)
25 print (X_train_cv)
---> 26 result_matrix_train = hstack((result_matrix_train, X_train_cv))
27 else:
28 result_matrix_train = X_train_cv
584 exp=brow_lengths[i],
585 got=A.shape[0]))
--> 586 raise ValueError(msg)
587
588 if bcol_lengths[j] == 0:
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 2, expected 1.
解决方案
result_matrix_train 是形状为 (2,) 的一维 Series,在 hstack 中被隐式转换为稀疏矩阵后形状变成了 (1,2),与 X_train_cv 的行数 (2) 不一致(result_matrix_test 同理)。您需要使用 scipy.sparse.csr_matrix.reshape(spar_mat, (-1,1)) 将其变为 (2,1)。
from sklearn.model_selection import train_test_split
import pandas as pd
from pandas import DataFrame
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
import scipy
# Worked fix: turn the 1-D 'Age' Series into an (n, 1) sparse column before
# stacking it next to the bag-of-words counts.
big_X = pd.DataFrame({'Tags':['tag_a tag_b tag_c', 'tag_b tag_c', 'tag_b tag_c tag_d', 'tag_e tag_b tag_b tag_a'], 'Age':[20, 21, 19, 18]})
big_Y = pd.DataFrame({'Label':[0, 1, 0, 1]})
# No random_state is given, so which rows land in train/test differs per run.
X_train, X_test, y_train, y_test = train_test_split(big_X, big_Y, test_size=0.5)
result_matrix_train = X_train['Age']
result_matrix_test = X_test['Age']
feature_colunm_name = "Tags"
cv = CountVectorizer(stop_words=None)
X_train_cv = cv.fit_transform(X_train[feature_colunm_name])
result_matrix_train.shape  # (2,)
# explicitly convert to csr matrix (your code did this implicitly when calling hstack)
spar_mat = scipy.sparse.csr_matrix(result_matrix_train.values)
# this now has the wrong shape: a 1-D array becomes a single row
spar_mat.shape  # (1, 2)
# reshape this to be (n x 1); equivalent to
# scipy.sparse.csr_matrix.reshape(spar_mat, (-1, 1))
spar_mat_shape = spar_mat.reshape((-1, 1))
# this now has the right shape for hstack
spar_mat_shape.shape  # (2, 1)
# The column count equals the vocabulary size fitted on the train rows, so it
# depends on the random split; (2, 3) is one possible outcome.
X_train_cv.shape  # e.g. (2, 3)
# hstack succeeds
result_matrix_train = hstack((spar_mat_shape, X_train_cv))
result_matrix_train.shape  # (2, 1 + vocabulary size), e.g. (2, 4)
# you need to do the same for the "test" portion of your code
result_matrix_test.shape  # (2,)
X_test_cv = cv.transform(X_test[feature_colunm_name])
# result_matrix_test = hstack((result_matrix_test, X_test_cv)) ... this would fail
# this will succeed:
spar_mat_test = scipy.sparse.csr_matrix(result_matrix_test.values)
spar_mat_test_shape = spar_mat_test.reshape((-1, 1))
result_matrix_test = hstack((spar_mat_test_shape, X_test_cv))
# cv.transform reuses the train vocabulary, so the column count always matches
# the train matrix.
result_matrix_test.shape  # same as result_matrix_train.shape, e.g. (2, 4)
推荐阅读
- r - 读取 csv 文件的特定行及其以下转换
- python - 创建自定义数组
- hyperledger-fabric - 即使获得所有组织的批准也无法提交链码
- c# - 使用 autofac 注入两个相同类型的对象
- python - 获取重复样本并将其添加到新的数据框:Python Pandas
- statistics - 使用控制变量创建散点图
- ios - 地图套件。如何动态设置缩放?
- angular - HttpClient post返回true,但组件订阅函数为false
- javascript - 通过 Vanilla Javascript 以模态加载 JSON 数据
- powerbi - 如何从 Power Query M(Power BI)的另一个表中的特定 column 中获取最小值?