首页 > 解决方案 > 连接两个数据框时出现奇怪的python数据框尺寸问题

问题描述

下面是用于重现问题的代码和错误信息。我还打印了要(用 hstack)拼接的两个数据框的原始内容和形状,看起来都没问题,想知道错误出在哪里?

from sklearn.model_selection import train_test_split
import pandas as pd
from pandas import DataFrame
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer

big_X = pd.DataFrame({'Tags':['tag_a tag_b tag_c', 'tag_b tag_c', 'tag_b tag_c tag_d', 'tag_e tag_b tag_b tag_a'], 'Age':[20, 21, 19, 18]})  # toy data: a text column ('Tags') and a numeric column ('Age')
big_Y = pd.DataFrame({'Label':[0, 1, 0, 1]})  # one label per row of big_X

X_train, X_test, y_train, y_test = train_test_split(big_X, big_Y, test_size=0.5)  # random split of the 4 rows: 2 for training, 2 for testing
result_matrix_train = X_train['Age']  # a 1-D pandas Series of shape (2,), not a 2-D column
result_matrix_test = X_test['Age']  # likewise a 1-D Series

sparse_columns = ['Tags']  # text columns to turn into bag-of-words count features
for feature_colunm_name in sparse_columns:
  print('processing feature name: ', feature_colunm_name)
  cv = CountVectorizer(stop_words=None)
  X_train_cv = cv.fit_transform(X_train[feature_colunm_name])  # sparse count matrix with one row per training sample
  print ('X_train_cv: ', X_train_cv)
  print ('result_matrix_train: ', result_matrix_train)

  # Merge the vector with others
  if result_matrix_train is not None:  # always true here: it was initialised to a Series above
    print (result_matrix_train)
    print (X_train_cv)
    result_matrix_train = hstack((result_matrix_train, X_train_cv))  # raises ValueError: the 1-D Series becomes a 1-row block, but X_train_cv has 2 rows
  else:
    result_matrix_train = X_train_cv

  # Now transform the test data
  X_test_cv = cv.transform(X_test[feature_colunm_name])  # transform only: uses the vectorizer fitted on the training column
  if result_matrix_test is not None:
    result_matrix_test = hstack((result_matrix_test, X_test_cv))  # would fail the same way: result_matrix_test is also 1-D
  else:
    result_matrix_test = X_test_cv

错误信息:

     24     print (result_matrix_train)
     25     print (X_train_cv)
---> 26     result_matrix_train = hstack((result_matrix_train, X_train_cv))
     27   else:
     28     result_matrix_train = X_train_cv

    584                                                     exp=brow_lengths[i],
    585                                                     got=A.shape[0]))
--> 586                     raise ValueError(msg)
    587 
    588                 if bcol_lengths[j] == 0:

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 2, expected 1.

标签: python dataframe

解决方案


result_matrix_train(以及 result_matrix_test)的形状是 (2,),在 hstack 内部被转换为稀疏矩阵时会变成 (1,2),因此与另一个块的行数不匹配。您需要使用 scipy.sparse.csr_matrix.reshape(spar_mat, (-1,1)) 将其变为 (2,1)。

from sklearn.model_selection import train_test_split
import pandas as pd
from pandas import DataFrame
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
import scipy

# Walk-through of the fix: a 1-D column must be turned into an (n, 1) sparse
# column before it can be hstack-ed with the (n, n_vocab) CountVectorizer output.

# Toy data: a text column ('Tags') to vectorize and a numeric column ('Age').
big_X = pd.DataFrame({'Tags':['tag_a tag_b tag_c', 'tag_b tag_c', 'tag_b tag_c tag_d', 'tag_e tag_b tag_b tag_a'], 'Age':[20, 21, 19, 18]})
big_Y = pd.DataFrame({'Label':[0, 1, 0, 1]})

# Random 50/50 split of the 4 rows: 2 rows for training, 2 for testing.
X_train, X_test, y_train, y_test = train_test_split(big_X, big_Y, test_size=0.5)
result_matrix_train = X_train['Age']
result_matrix_test = X_test['Age']

feature_colunm_name = "Tags"
cv = CountVectorizer(stop_words=None)
X_train_cv = cv.fit_transform(X_train[feature_colunm_name])

result_matrix_train.shape # (2,) -- a 1-D Series, not a column

# explicitly convert to csr matrix (your code did this implicitly when calling hstack)
spar_mat = scipy.sparse.csr_matrix(result_matrix_train.values)

# a 1-D input is interpreted as a single row, so this now has the wrong shape
spar_mat.shape # (1, 2)

# reshape this to be (n x 1); -1 lets the row count be inferred
spar_mat_shape = spar_mat.reshape((-1, 1))

# this now has the right shape for hstack: both blocks have 2 rows
spar_mat_shape.shape # (2, 1)
X_train_cv.shape # (2, n_vocab) -- n_vocab depends on which rows the random split picked

# hstack succeeds
result_matrix_train = hstack((spar_mat_shape, X_train_cv))
result_matrix_train.shape # (2, 1 + n_vocab)

# you need to do the same for the "test" portion of your code
result_matrix_test.shape # (2,)
# transform (not fit_transform): the test matrix gets the same n_vocab columns as X_train_cv
X_test_cv = cv.transform(X_test[feature_colunm_name])

# result_matrix_test = hstack((result_matrix_test, X_test_cv)) ... this would fail
# this will succeed:
spar_mat_test = scipy.sparse.csr_matrix(result_matrix_test.values)
spar_mat_test_shape = spar_mat_test.reshape((-1, 1))
result_matrix_test = hstack((spar_mat_test_shape, X_test_cv))
result_matrix_test.shape # (2, 1 + n_vocab) -- same as result_matrix_train.shape

推荐阅读