python - Sklearn - 发现样本数量不一致的输入变量:[16512, 4128]
问题描述
我正在学习《Hands-On Machine Learning with Scikit-Learn and TensorFlow》(Scikit-Learn 与 TensorFlow 机器学习实战)的第二章,遇到了标题中的错误。当我执行下面这一行时会出现该错误:
linReg.fit(housingPrepared, housing_labels)
在网上查了一下,这似乎与特征和标签的样本数量不匹配有关。打印 housingPrepared (X) 和 housing_labels (Y) 的形状,得到如下结果:
(16512, 16) (4128,)
我花了一个小时逐行检查,看是否漏掉了本章中的某一行,但没有发现任何问题。想知道这里是否有人对这个问题的可能解决方案有什么想法。
非常感谢。出错那一行之前的全部代码如下:
import os
import tarfile
from six.moves import urllib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from zlib import crc32
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import Imputer, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from CategoricalEncoder import CategoricalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.utils.validation import check_array
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
# Base URL of the repository that hosts the dataset files.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Relative local directory used as the default location for the housing archive.
HOUSING_PATH = os.path.join("datasets","housing")
# Full URL of the compressed housing archive.
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
def fetchHousingData(housingUrl=HOUSING_URL, housingPath=HOUSING_PATH):
    """Download the housing archive and extract it into ``housingPath``.

    Args:
        housingUrl: URL of the ``.tgz`` archive to download.
        housingPath: Directory that receives ``housing.tgz`` and the
            extracted files; it is created when it does not exist yet.
    """
    if not os.path.isdir(housingPath):
        os.makedirs(housingPath)
    tgzPath = os.path.join(housingPath, "housing.tgz")
    urllib.request.urlretrieve(housingUrl, tgzPath)
    # The context manager closes the archive even if extraction raises,
    # which the manual open()/close() pair did not guarantee.
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only point housingUrl at a source you trust.
    with tarfile.open(tgzPath) as housingTgz:
        housingTgz.extractall(path=housingPath)
def loadHousingData(housingPath=HOUSING_PATH):
    """Read the housing CSV from the remote repository into a DataFrame.

    ``housingPath`` is accepted but not used: the data is always read from
    the hard-coded URL below.
    """
    csvUrl = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv"
    return pd.read_csv(csvUrl)
# Load the full housing dataset into a DataFrame.
housing = loadHousingData()
# Disabled: histogram of the longitude column.
#plt.hist(housing['longitude'],bins=50)
#plt.show()
def splitTrainTesT(data, testRatio):
    """Randomly partition the rows of ``data`` into a (train, test) pair.

    Args:
        data: Object supporting ``len()`` and positional ``.iloc`` indexing.
        testRatio: Fraction of rows to place in the test part; the test
            size is ``int(len(data) * testRatio)``.

    Returns:
        Tuple ``(train, test)`` selected via a random permutation of the
        row positions.
    """
    rowCount = len(data)
    permutation = np.random.permutation(rowCount)
    cutoff = int(rowCount * testRatio)
    # The first `cutoff` shuffled positions form the test part.
    trainPart = data.iloc[permutation[cutoff:]]
    testPart = data.iloc[permutation[:cutoff]]
    return trainPart, testPart
def testSetCheck(identifier, testRatio):
return crc32(np.int64(identifier)) & 0xffffffff < testRatio * 2 ** 32
def splitTrainTestByID(data, testRatio, idColumn):
    """Split ``data`` into (train, test) using a per-row identifier column.

    Args:
        data: DataFrame to split.
        testRatio: Ratio forwarded to ``testSetCheck`` for every identifier.
        idColumn: Name of the column holding the row identifiers.

    Returns:
        Tuple ``(train, test)``: rows whose identifier fails the check, then
        rows whose identifier passes it.
    """
    testMask = data[idColumn].apply(lambda rowId: testSetCheck(rowId, testRatio))
    trainRows = data.loc[~testMask]
    testRows = data.loc[testMask]
    return trainRows, testRows
#housingWithID = housing.reset_index()
#trainSet, testSet = splitTrainTestByID(housingWithID,0.2,"index")
# Random 80/20 split with a fixed seed.
trainSet, testSet = train_test_split(housing,test_size=0.2,random_state=42)
# Derive an income category column to stratify on.
housing["income_cat"] = np.ceil(housing["median_income"]/1.5)
# Cap the category at 5.0. The result is assigned back to the column because
# calling .where(..., inplace=True) on a column selection is a chained
# in-place update that may leave the frame untouched under copy-on-write.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)
#plt.hist(housing["income_cat"])
#plt.show()
# Single stratified 80/20 split on the income category.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for trainIndex, testIndex in split.split(housing, housing["income_cat"]):
    stratTrainSet = housing.loc[trainIndex]
    stratTestSet = housing.loc[testIndex]
# Remove the helper column again; the loop variable is named `subset` so the
# builtin `set` is not shadowed at module level.
for subset in (stratTrainSet, stratTestSet):
    subset.drop("income_cat", axis=1, inplace=True)
# Work on a copy of the stratified training set from here on.
housing = stratTrainSet.copy()
#print(housing)
#plt.scatter(x=housing["latitude"],y=housing["longitude"], alpha=0.4)
#plt.show()
# Pairwise correlations between the columns of the training copy.
# NOTE(review): housing still contains the ocean_proximity column here, which
# is presumably non-numeric; some pandas versions may need numeric_only=True
# for this call -- confirm against the installed version.
corr_matrix = housing.corr()
#print(corr_matrix["median_house_value"].sort_values(ascending=False))
#attribues = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
#scatter_matrix(housing[attribues], figsize=(12,8))
#plt.show()
""" PREPARING DATA FOR MACHINE LEARNING ALGORITHMS"""
# Features: the stratified training set without the target column.
housing = stratTrainSet.drop("median_house_value", axis=1)
# NOTE(review): the features above come from stratTrainSet, but these labels
# are taken from stratTestSet, so the two have different numbers of rows.
housing_labels = stratTestSet["median_house_value"].copy()
# The returned frame is not assigned, so `housing` itself is left unchanged.
housing.dropna(subset=["total_bedrooms"])
# Median imputation fitted on the columns other than ocean_proximity.
imputer = Imputer(strategy="median")
housingNum = housing.drop("ocean_proximity", axis=1)
imputer.fit(housingNum)
X = imputer.transform(housingNum)
# Wrap the imputed array back into a DataFrame with the original column names.
housingTr = pd.DataFrame(X, columns=housingNum.columns)
# Encode the ocean_proximity column: integer codes first, then one-hot.
housingCat = housing["ocean_proximity"]
housingCatEncoded, housingCategories = housingCat.factorize()
encoder = OneHotEncoder()
housingCat1Hot = encoder.fit_transform(housingCatEncoded.reshape(-1,1))
"""Custom Transformers For Rooms Per Household, etc"""
# Column positions read by CombinedAttributesAdder.transform.
roomsIX, bedroomsIX, populationIX, householdsIX = 3,4,5,6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    Rooms-per-household and population-per-household are always appended;
    bedrooms-per-room is appended as well when ``addBedroomsPerRoom`` is true.
    Column positions come from the module-level ``roomsIX``, ``bedroomsIX``,
    ``populationIX`` and ``householdsIX`` values.
    """

    def __init__(self, addBedroomsPerRoom = True):
        self.addBedroomsPerRoom = addBedroomsPerRoom

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        households = X[:, householdsIX]
        rooms = X[:, roomsIX]
        roomsPerHousehold = rooms / households
        populationPerHousehold = X[:, populationIX] / households
        if not self.addBedroomsPerRoom:
            return np.c_[X, roomsPerHousehold, populationPerHousehold]
        bedroomsPerRoom = X[:, bedroomsIX] / rooms
        return np.c_[X, roomsPerHousehold, populationPerHousehold, bedroomsPerRoom]
# Run the custom transformer on its own, without the bedrooms-per-room column.
attrAdder = CombinedAttributesAdder(addBedroomsPerRoom=False)
housingExtraAttribs = attrAdder.transform(housing.values)
# Numeric pipeline: median imputer, then the attribute adder, then StandardScaler.
numPipeline = Pipeline([('imputer', Imputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
    ])
# Fit the pipeline on housingNum (housing without ocean_proximity) and transform it.
housingNumTr = numPipeline.fit_transform(housingNum)
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks the given columns and returns their ``.values``."""

    def __init__(self, attributeNames):
        self.attributeNames = attributeNames

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Index ``X`` with the stored attribute names and return ``.values``."""
        selectedColumns = X[self.attributeNames]
        return selectedColumns.values
# Column names handled by the numeric and the categorical branch.
numAttribs = list(housingNum)
catAttribs = ["ocean_proximity"]
# Numeric branch: select columns, impute medians, add ratio columns, scale.
numPipeline = Pipeline([('selector', DataFrameSelector(numAttribs)),
    ('imputer', Imputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),])
"""UPDATE SKLEARN TO INCLUDE CATEGORICAL ENCODER LIBRARY"""
# Categorical branch: select ocean_proximity and one-hot encode it (dense output).
catPipeline = Pipeline([('selector', DataFrameSelector(catAttribs)),
    ('cat_encoder', CategoricalEncoder(encoding='onehot-dense')),
    ])
# Combine both branches into one feature matrix.
fullPipeline = FeatureUnion(transformer_list=[("num_pipeline", numPipeline), ("cat_pipeline", catPipeline),])
housingPrepared = fullPipeline.fit_transform(housing)
linReg = LinearRegression()
# Show both shapes before fitting.
print(housingPrepared.shape, housing_labels.shape)
# NOTE(review): housingPrepared is built from stratTrainSet while housing_labels
# was taken from stratTestSet, so the row counts passed to fit() differ.
linReg.fit(housingPrepared, housing_labels)
解决方案
我相信问题出在这两行:
housing = stratTrainSet.drop("median_house_value", axis=1)
housing_labels = stratTestSet["median_house_value"].copy()
将其更改为:
housing = stratTrainSet.drop("median_house_value", axis=1)
housing_labels = stratTrainSet["median_house_value"].copy()
这样就可以了。
推荐阅读
- amazon-ec2 - aws ip4地址的http请求问题
- java - 如何使用 Apache Camel 的文件组件选择子文件夹?
- python - 尝试运行基本 for 循环时出现类型错误(Python)
- c - 从C中的整数中分离颜色分量
- macos - Mac Numbers : automatic get of stock exchange values
- c# - Regex.Escape - 抛出异常:C# 中 System.Text.RegularExpressions.dll 中的“System.ArgumentNullException”
- c# - UnassignedReferenceException The variable cactusSpawnPosition of GameManager has not been assigned
- javascript - how to reset div id to it's initial values in angular 9
- mysql - Updating a column if it is null in batch
- html - How to style individual p tags inside a tags?