python - 带有预训练的 Sklearn GridSearch
问题描述
我正在使用一个由 GridSearchCV 优化的 Sklearn Pipeline。管道必须为实现"预训练然后微调"方法的几个不同实体获得最佳模型:先对所有实体一起预训练,再对每个实体微调并为每个实体返回一个模型。管道的约束如下:
- 预训练和微调必须在同一管道中,因为这两个模型必须在每个 GridSearchCV 的折叠中具有相同的数据。
- 预训练模型必须将其权重传递给微调模型。
我已经实现:
- 一个 Sklearn 转换器,它接受一个包含所有实体的数据框作为输入,并对自身进行拟合。
- 一个 Sklearn 回归器,它将数据帧按实体拆分成多个数据帧,并为每个实体拟合一个 Keras 模型。
我缺少的是如何将 Pre-train 转换器获得的权重从 Pre-train 转换器传递到 Fine-tuning 转换器(考虑到每个 GridSearchCV 折叠具有不同的权重)
这是代码:
import pandas as pd
import numpy as np
import random
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin
from sklearn.metrics import mean_squared_error
from keras.models import Model
from keras.layers import Dense, Input
import copy
class MyRegressor(BaseEstimator, TransformerMixin):
    """A small Keras MLP wrapped as an sklearn estimator.

    Acts both as a transformer (identity ``transform`` so it can sit as an
    intermediate pipeline step) and as a regressor (``predict``/``score``).
    If ``preTrain`` has been set before ``fit`` is called, the weights it
    points to are loaded into the freshly built model before training,
    implementing the pre-train -> fine-tune hand-off.
    """

    def __init__(self, neurons, featInput, featOutput):
        # Store hyper-parameters verbatim: sklearn's get_params/set_params
        # (used by GridSearchCV) rely on attributes matching arg names.
        self.neurons = neurons
        # Optional source of pre-trained weights, injected by an earlier step.
        self.preTrain = None
        self.featInput = featInput
        self.featOutput = featOutput

    def fit(self, X, y=None):
        """Build the network and train it on the named columns of X.

        The target columns live inside X (``featOutput``); the ``y``
        argument is accepted only for sklearn API compatibility.
        """
        X_train = X[self.featInput]
        y_train = X[self.featOutput]
        inputLayer = Input(shape=(len(self.featInput), ), name='INPUT')
        hidden = Dense(self.neurons, name='HIDDEN')(inputLayer)
        outputLayer = Dense(len(self.featOutput), name='OUTPUT')(hidden)
        self.model = Model(inputLayer, outputLayer)
        self.model.compile(loss='mse', optimizer='rmsprop')
        if self.preTrain is not None:
            # BUG FIX: the Keras API is Model.load_weights, not
            # "loadWeights" — the original raised AttributeError whenever
            # pre-trained weights were supplied.
            self.model.load_weights(self.preTrain)
        self.model.fit(X_train, y_train)
        return self

    def predict(self, X):
        """Return model predictions for the input feature columns of X."""
        return self.model.predict(X[self.featInput])

    def transform(self, X):
        # Identity transform: lets this estimator act as a pass-through
        # pipeline step after fitting.
        return X

    def score(self, X, y=None, sample_weight=None):
        """Return the MSE between the target columns of X and the predictions.

        NOTE(review): this returns a raw MSE (lower is better), while sklearn
        model selection maximizes scores — confirm the sign convention is
        intended before relying on GridSearchCV's best_params_.
        """
        y_true = X[self.featOutput]
        y_pred = self.predict(X)
        return mean_squared_error(y_true, y_pred)
class LoopTransformer(BaseEstimator, TransformerMixin):
    """Fits one clone of ``component`` per distinct entity found in X.

    Entities are identified by the combination of values in ``columns``;
    each per-entity sub-frame is fitted/scored with its own deep copy of
    the template ``component``.
    """

    def __init__(self, columns, component):
        # Identifier column(s) and the template estimator to clone per entity.
        self.columns = columns
        self.component = component
        self.components = []

    def fit(self, X, y=None):
        """Fit an independent copy of the component on each entity's rows."""
        # BUG FIX: reset the fitted-component list on every fit. The
        # original only initialized it in __init__, so refitting the same
        # instance (e.g. across GridSearchCV folds when the estimator is
        # not cloned) kept appending stale models from previous fits.
        self.components = []
        for index, idx in X[self.columns].drop_duplicates().iterrows():
            # Select the rows whose id-columns all match this entity.
            entityDf = X[(X[self.columns] == idx).sum(axis=1) == len(self.columns)].copy()
            self.components.append({'id': idx, 'component': copy.deepcopy(self.component)})
            self.components[-1]['component'].fit(entityDf, y)
        return self

    def predict(self, X):
        """Predict each entity's rows with its own model; concat the results."""
        results = []
        for comp in self.components:
            entityDf = X[(X[self.columns] == comp['id']).sum(axis=1) == len(self.columns)].copy()
            res = comp['component'].predict(entityDf)
            results.append(res)
        dfRes = pd.concat(results)
        return dfRes

    def score(self, X, y=None, sample_weight=None):
        """Average the per-entity scores over entities present in X."""
        results = []
        for comp in self.components:
            entityDf = X[(X[self.columns] == comp['id']).sum(axis=1) == len(self.columns)].copy()
            # An entity seen at fit time may be absent from this fold.
            if len(entityDf) > 0:
                results.append(comp['component'].score(entityDf))
        return np.average(results)
# Build the input frame: 3 entities, each with a random-length series where
# output = input * (entityId + 1). The empty seed frame pins column order.
frames = [pd.DataFrame([], columns=['entityId', 'input', 'output'])]
for entity in range(3):
    xs = np.arange(random.randint(10, 20))
    block = pd.DataFrame(np.array([xs, xs * (entity + 1)]).T, columns=['input', 'output'])
    block['entityId'] = entity
    frames.append(block)
dataFrame = pd.concat(frames, sort=False).reset_index(drop=True)

# Assemble the pipeline: a shared pre-training step followed by a
# per-entity fine-tuning step that clones its inner regressor.
neurons = [5, 10]
myPipe = Pipeline([('preTrain',
                    MyRegressor(neurons=neurons[0], featInput=['input'], featOutput=['output'])),
                   ('fineTuning',
                    LoopTransformer(['entityId'],
                                    MyRegressor(
                                        neurons=neurons[0],
                                        featInput=['input'],
                                        featOutput=['output'])))])

# Pre-train and fine-tuning must always share the same neuron count, so the
# grid pairs the two hyper-parameters instead of taking their product.
params = [{
    'preTrain__neurons': [neurons[0]],
    'fineTuning__component__neurons': [neurons[0]]
}, {
    'preTrain__neurons': [neurons[1]],
    'fineTuning__component__neurons': [neurons[1]]
}]

gs = GridSearchCV(myPipe, params, verbose=1, cv=3)
gs.fit(dataFrame, dataFrame)
score = gs.score(dataFrame, dataFrame)
print(score)
解决方案
我很确定 sklearn.Pipeline 并不支持这一点。但只要您不克隆您的管道(例如使用 GridSearchCV 时就会发生克隆),您就可以用类似下面的代码来"破解":把管道中某一步的实例直接传给下一步。您可以在自己的管道中应用同样的原则:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.base import BaseEstimator, TransformerMixin
class MyTransformer(BaseEstimator, TransformerMixin):
    """Identity transformer that holds a reference to another estimator.

    Because the *same* scaler object is also an earlier pipeline step, by
    the time ``fit`` runs here the scaler has already been fitted, and its
    learned state (``mean_``) is readable — this is the hand-off hack.
    """

    def __init__(self, scaler):
        # Shared instance: the previous pipeline step fits this very object.
        self.scaler = scaler

    def transform(self, X):
        # Pass the data through unchanged.
        return X

    def fit(self, X, y=None):
        # The scaler was fitted by the preceding step, so mean_ exists here.
        print("got the means: %s" % self.scaler.mean_)
        return self
# Demonstrate the hack: the very same scaler instance appears both as a
# pipeline step and as a constructor argument of the next step, so the
# next step can read the attributes learned during the scaler's fit.
X, y = load_iris(return_X_y=True)
scaler = StandardScaler()
steps = [
    scaler,
    MyTransformer(scaler),
    LogisticRegression(solver='lbfgs', multi_class='auto'),
]
pipeline = make_pipeline(*steps)
pipeline = pipeline.fit(X, y)

# Shift the data and refit: MyTransformer prints the updated means,
# proving it sees the freshly fitted state of the shared scaler.
X = X - 1
pipeline = pipeline.fit(X, y)
正如预期的那样,这会给你这个输出:
got the means: [5.84333333 3.05733333 3.758 1.19933333]
got the means: [4.84333333 2.05733333 2.758 0.19933333]
推荐阅读
- python - Django ModuleNotFoundError:没有名为“ui”的模块
- hyperledger - Hyperledger Iroha - 运行示例代码时出现 MST_EXPIRED 错误
- javascript - 无法使用 Sockets 在聊天应用程序中发送消息?
- python - 如何使 pip 在 Windows 上可用于 git bash 命令行?
- android - 安卓电话类型解读
- java - GridBagLayout 中的项目未正确对齐
- c - gcc execstack 标志究竟允许哪些情况以及它是如何执行的?
- amazon-web-services - Amazon S3 存储桶策略公共访问被拒绝
- sql-server - 将数据库中的所有 datetime 列转换为 datetime2 有什么危险
- c - 为什么调用 AddNode() 函数后大小等于二叉树中添加的数据?