python - Catboost:为什么多类分类在内部转换为回归/单类分类问题
问题描述
在多类分类中使用自定义损失函数时,我收到一个错误:我的自定义目标函数没有 `calc_ders_range` 属性。但是,根据我在 CatBoost 的 Telegram 频道中的讨论,`calc_ders_range` 是针对单类分类/回归的。然而,即使我已经将自定义目标传递给了 `CatBoostClassifier`,仍然出现这个错误。
我的代码:输出标签为 `int64` 类型,取值从 `0` 到 `25`,代表 26 个类。自定义目标和准确率指标取自 Usage Examples 中的示例:https://catboost.ai/docs/concepts/python-usages-examples.html#user-defined-loss-function
class MyObjective(object):
    """Custom multi-class objective based on the softmax log-likelihood."""

    def calc_ders_multi(self, approx, target, weight):
        """Return first and second derivatives for a single object.

        Args:
            approx: Indexed container with one raw score per class.
            target: Expected class index; compared with ``==`` against the
                positions ``0 .. len(approx) - 1``.
            weight: Multiplier applied to every derivative.

        Returns:
            A ``(grad, hess)`` tuple. ``grad`` is a list with one entry per
            class; ``hess`` is a list of rows (``len(approx)`` x
            ``len(approx)``).
        """
        scores = np.array(approx)
        # Subtract the largest score before exponentiating so np.exp cannot
        # overflow; the resulting probabilities are unaffected by the shift.
        exp_approx = np.exp(scores - scores.max())
        probs = exp_approx / exp_approx.sum()

        # d/d approx[j] of log(probs[target]) = [j == target] - probs[j]
        is_target = np.arange(len(probs)) == target
        grad = (is_target - probs) * weight

        # d2/(d approx[j] d approx[k]) = probs[j] * probs[k] - [j == k] * probs[j]
        hess = (np.outer(probs, probs) - np.diag(probs)) * weight

        return (grad.tolist(), hess.tolist())
class AccuracyMetric(object):
    """Custom evaluation metric: weighted share of correctly predicted classes."""

    def get_final_error(self, error, weight):
        """Turn the accumulated ``(error, weight)`` pair into the final score.

        The tiny constant in the denominator avoids a division by zero when
        the accumulated weight is 0.
        """
        return error / (weight + 1e-38)

    def is_max_optimal(self):
        """Report that larger metric values are better."""
        return True

    def evaluate(self, approxes, target, weight):
        """Accumulate weighted hits over all objects.

        Args:
            approxes: Scores indexed as ``approxes[class][object]``; the
                predicted class of an object is the argmax along axis 0.
            target: Indexed container with the expected class per object.
            weight: Indexed container with per-object weights, or ``None``
                to weight every object with 1.0.

        Returns:
            ``(accuracy_sum, weight_sum)``: the summed weight of correctly
            classified objects and the summed weight of all objects.
        """
        best_class = np.argmax(approxes, axis=0)

        accuracy_sum = 0
        weight_sum = 0
        for i in range(len(target)):
            w = 1.0 if weight is None else weight[i]
            weight_sum += w
            # The comparison yields a boolean, so only hits contribute w.
            accuracy_sum += w * (best_class[i] == target[i])

        return accuracy_sum, weight_sum
def get_pipeline(args=None, *, classes_count=26):
    """Create the feature-union + CatBoost training pipeline.

    Args:
        args: Optional namespace-like object. Its ``f1_max_features`` and
            ``f1_max_ngram`` attributes, when present and not ``None``, are
            forwarded to the TF-IDF step of the ``feat1`` branch. Anything
            missing leaves the vectorizer's own defaults untouched.
        classes_count: Number of target classes handed to
            ``CatBoostClassifier``; pass ``None`` to omit the parameter.
            NOTE(review): setting it explicitly is the reported workaround
            for "'MyObjective' object has no attribute 'calc_ders_range'";
            the default of 26 matches labels 0..25 -- confirm for other data.

    Returns:
        An unfitted ``Pipeline`` with the steps ``features`` and ``clf``.
    """
    def _one_hot_branch(column):
        # feat2 and feat3 share the same select -> impute -> one-hot chain.
        return Pipeline([
            ('selector', ColumnSelector(cols=column, drop_axis=False)),
            ('imputer', SimpleImputer(missing_values=np.nan, strategy="constant",
                                      fill_value=0, copy=False)),
            ('ohe', OneHotEncoder(handle_unknown='ignore')),
        ])

    pipeline_feat1 = Pipeline([
        ('selector', ColumnSelector(cols='feat1', drop_axis=True)),
        ('vec', TfidfVectorizer(tokenizer=word_tokenize)),
    ])

    features = FeatureUnion([
        ('f1', pipeline_feat1),
        ('f2', _one_hot_branch('feat2')),
        ('f3', _one_hot_branch('feat3')),
    ])

    clf_kwargs = {
        'task_type': 'CPU',
        'iterations': 5000,
        'random_seed': 0,
        'loss_function': MyObjective(),
        'eval_metric': AccuracyMetric(),
        'verbose': 100,
    }
    if classes_count is not None:
        clf_kwargs['classes_count'] = classes_count

    train_pipeline = Pipeline([
        ('features', features),
        ('clf', CatBoostClassifier(**clf_kwargs)),
    ])

    # Only override vectorizer settings that were actually supplied. The
    # n-gram bound is checked before the tuple is built: (1, None) is itself
    # not None and would otherwise slip through a plain None filter.
    max_features = getattr(args, 'f1_max_features', None)
    max_ngram = getattr(args, 'f1_max_ngram', None)
    params = {}
    if max_features is not None:
        params["features__f1__vec__max_features"] = max_features
    if max_ngram is not None:
        params["features__f1__vec__ngram_range"] = (1, max_ngram)
    train_pipeline.set_params(**params)

    return train_pipeline
# Build the (still unfitted) training pipeline.
# NOTE(review): get_pipeline is called without arguments here; confirm its
# `args` parameter is optional or pass the parsed options object.
pipeline = get_pipeline()

# Keep 80% of the rows for training; the remainder is the validation split.
# The fixed random_state pins the shuffle used for the split.
X_train, X_val, y_train, y_val = train_test_split(
    df_train[['feat1', 'feat2', 'feat3']],
    df_train['label'],
    train_size=0.8,
    random_state=21,
)

# Fit the whole pipeline on the training portion only.
model = pipeline.fit(X_train, y_train)
错误信息:
AttributeError Traceback (most recent call last)
_catboost.pyx in _catboost._ObjectiveCalcDersRange()
AttributeError: 'MyObjective' object has no attribute 'calc_ders_range'
During handling of the above exception, another exception occurred:
CatBoostError Traceback (most recent call last)
<ipython-input-12-ea20f154d788> in <module>
10 train_size=0.8,
11 random_state=21)
---> 12 model = pipeline.fit(X_train, y_train)
13
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
333 if self._final_estimator != 'passthrough':
334 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 335 self._final_estimator.fit(Xt, y, **fit_params_last_step)
336
337 return self
/opt/conda/lib/python3.7/site-packages/catboost/core.py in fit(self, X, y, cat_features, text_features, embedding_features, sample_weight, baseline, use_best_model, eval_set, verbose, logging_level, plot, column_description, verbose_eval, metric_period, silent, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval, init_model)
4296 self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline, use_best_model,
4297 eval_set, verbose, logging_level, plot, column_description, verbose_eval, metric_period,
-> 4298 silent, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval, init_model)
4299 return self
4300
/opt/conda/lib/python3.7/site-packages/catboost/core.py in _fit(self, X, y, cat_features, text_features, embedding_features, pairs, sample_weight, group_id, group_weight, subgroup_id, pairs_weight, baseline, use_best_model, eval_set, verbose, logging_level, plot, column_description, verbose_eval, metric_period, silent, early_stopping_rounds, save_snapshot, snapshot_file, snapshot_interval, init_model)
1807 params,
1808 allow_clear_pool,
-> 1809 train_params["init_model"]
1810 )
1811
/opt/conda/lib/python3.7/site-packages/catboost/core.py in _train(self, train_pool, test_pool, params, allow_clear_pool, init_model)
1256
1257 def _train(self, train_pool, test_pool, params, allow_clear_pool, init_model):
-> 1258 self._object._train(train_pool, test_pool, params, allow_clear_pool, init_model._object if init_model else None)
1259 self._set_trained_model_attributes()
1260
_catboost.pyx in _catboost._CatBoost._train()
_catboost.pyx in _catboost._CatBoost._train()
CatBoostError: catboost/python-package/catboost/helpers.cpp:42: Traceback (most recent call last):
File "_catboost.pyx", line 1345, in _catboost._ObjectiveCalcDersRange
AttributeError: 'MyObjective' object has no attribute 'calc_ders_range'
解决方案
我通过显式设置 `classes_count` 参数解决了这个问题。例如:
# Example: state the number of classes explicitly via classes_count.
CatBoostClassifier(
    iterations=10,
    depth=2,
    classes_count=17,
)
推荐阅读
- c - 警告 - 赋值使指针从整数而不进行强制转换
- python - 如何为使用 django、docker 和 mysql 的项目设置最佳配置?
- r - 使用 data.table 的行均值和标准差
- android - Flutter 问题:如何隐藏状态栏同时避免信箱(notch)
- laravel - Passport - 以编程方式创建客户端凭据授予客户端
- docker - Windows 10 上的 Docker(守护进程)在启动时崩溃:“hnsCall 在 Win32 中失败:服务尚未启动”,而 hns 服务正在愉快地运行
- excel - 尝试创建数据透视表,运行时错误 91
- html - 使组件添加新组件的问题
- swift - didRequestRegionState 在 iOS 13 上失败
- excel - 通过excel vba创建表格并将数据放入word中显示错误