首页 > 解决方案 > 在 xgboost 训练中使用 cupy 而不是 numpy 显著降低了 test-auc 分数

问题描述

背景

问题

示例代码

import xgboost as xgb
import pandas as pd

# Load the pickled train/validation splits from the working directory.
# The validation files ('X_val.pkl', 'y_val.pkl') are bound to the *_test names.
X_train, X_test = pd.read_pickle('X_train.pkl'), pd.read_pickle('X_val.pkl')
y_train, y_test = pd.read_pickle('y_train.pkl'), pd.read_pickle('y_val.pkl')

使用 numpy.ndarray

# Baseline run: both DMatrix objects are built from host-side numpy arrays
# obtained via `.values`.
num_boost_round = 1000
early_stopping_rounds = 100

params = {
    "max_depth": 11,
    "min_child_weight": 7,
    "eta": 0.1,
    "subsample": 1,
    "colsample_bytree": 0.8,
    "eval_metric": "auc",
    "objective": "binary:logistic",
    # GPU-enabled training on device 0.
    "gpu_id": 0,
    "tree_method": "gpu_hist",
}

dtrain = xgb.DMatrix(X_train.values, label=y_train.values)
dtest = xgb.DMatrix(X_test.values, label=y_test.values)

# Both sets are evaluated during training, labelled "Train" and "Test".
watchlist = [(dtrain, "Train"), (dtest, "Test")]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=50,
)

# output
# Stopping. Best iteration:
# [999] Train-auc:0.84847   Test-auc:0.77523

使用 cupy.ndarray

import cupy
# Same experiment as the numpy run, but the DMatrix inputs are cupy.ndarray
# objects created from the identical host arrays.
# NOTE(review): cupy.array is called with its default `order`, so the device
# copy presumably keeps the memory layout of `.values`. If those arrays are not
# C-contiguous, check whether this xgboost version's DMatrix reads them
# correctly — TODO confirm.
X_train_cp = cupy.array(X_train.values)
y_train_cp = cupy.array(y_train.values)
X_test_cp = cupy.array(X_test.values)
y_test_cp = cupy.array(y_test.values)

# Settings are intentionally identical to the numpy run so that only the
# array type differs between the two experiments.
num_boost_round = 1000
early_stopping_rounds = 100

params = {
    "max_depth": 11,
    "min_child_weight": 7,
    "eta": 0.1,
    "subsample": 1,
    "colsample_bytree": 0.8,
    "eval_metric": "auc",
    "objective": "binary:logistic",
    # GPU-enabled training on device 0.
    "gpu_id": 0,
    "tree_method": "gpu_hist",
}

dtrain = xgb.DMatrix(X_train_cp, label=y_train_cp)
dtest = xgb.DMatrix(X_test_cp, label=y_test_cp)

# Both sets are evaluated during training, labelled "Train" and "Test".
watchlist = [(dtrain, "Train"), (dtest, "Test")]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=50,
)

# output
# Stopping. Best iteration:
# [182] Train-auc:0.58999   Test-auc:0.50665

# Round-trip sanity check: copy the device array back to host memory and
# compare it element-wise with the original numpy data.
X_train_np = cupy.asnumpy(X_train_cp)
matches = X_train_np == X_train.values
matches.all()
    
#output
True

xgb 版本 = '1.2.1',cupy 版本 = '8.1.0'。我正在 Kaggle 上运行这个测试。如果需要更多信息,请告诉我,我可以提供训练和测试数据。

标签: python numpy xgboost cupy

解决方案


推荐阅读