首页 > 解决方案 > xgb.train(): TypeError: float() argument must be a string or a number, not 'DMatrix'

问题描述

当我查看文档时,参数应该是“DMatrix”(xgboost 版本 1.5.0)。

https://xgboost.readthedocs.io/en/latest/python/python_api.html#:~:text=Customized%20objective%20function.-,Learning%20API,num_boost_round%20(int)%20%E2%80% 93%20Number%20of%20boosting%20iterations,- .

表示我使用的版本几乎相同(在下面的文档链接中转到子标题“1.2.2 Python”):

https://xgboost.readthedocs.io/_/downloads/en/release_1.3.0/pdf/

我不明白为什么它应该是一个 DMatrix 时要求一个浮点参数。

我查看了所有具有字符串'TypeError:float()参数必须是字符串或数字,而不是......'的堆栈帖子,但它们都没有包含'DMatrix',我无法找到我可以适应这个特定问题的解决方案。

以下是引发此错误的代码(转到'clf - xgb.train(...)'):

def grid_search(timeout_seconds, cv_splits, num_boost_round):
#   Read input data
X, y = preprocessing()
y.replace({1:0,2:1,3:2,4:3,5:4,6:5,7:6,8:7,9:8,10:9,11:10,12:11,13:12,14:13,
           15:14,16:15,17:16,18:17,19:18,20:19,21:20,22:21}, inplace = True)

#   Create dataframe to collect the results
tests_columns = ["test_nr", "cv_mean", "cv_min", "cv_max", "cv_median", "params"]
test_id = 0
tests = pd.DataFrame(columns=tests_columns)

#   Cross validation number of splits
kf = KFold(n_splits=cv_splits)

#   Execute until timeout occurs
with timeout(timeout_seconds, exception=RuntimeError):

    #   Get the grid
    grid_iter, keys, length = get_grid_iterable()
    try:

        #   For every element of the grid
        for df_grid in grid_iter:
            #   Prepare a list to collect the scores
            score = []
            params = dict(zip(keys, df_grid))

            #   The objective function
            params["objective"] = "multi:softprob"
            params['num_class'] = 22
            
            print('X.reason_action_converted: ', X.reason_action_converted)  
            #   For each fold, train XGBoost and spit out the results
            for train_index, test_index in kf.split(X.values):

                #   Get X train and X test
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]

                **#   Get y train and y test**
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                
                #   Convert into DMatrix
                d_train = xgb.DMatrix(X_train, label=y_train, missing=np.NaN)
                d_valid = xgb.DMatrix(X_test, label=y_test, missing=np.NaN)
                d_test = xgb.DMatrix(X_test, missing=np.NaN)
                watchlist = [(d_train, 'train'), (d_valid, 'valid')]

                #   Create the classifier using the current grid params. Apply early stopping of 50 rounds
                '''clf = xgb.train(params, d_train, boosting_rounds, watchlist, early_stopping_rounds=50, feval=log_loss, maximize=True, verbose_eval=10)'''
                **clf = xgb.train(params, d_train, num_boost_round, watchlist, early_stopping_rounds=50, feval=log_loss, maximize=True, verbose_eval=10)**
                y_hat = clf.predict(d_test)

                #   Append Scores on the fold kept out
                score.append(r2_score(y_test, y_hat))

            #   Store the result into a dataframe
            score_df = pd.DataFrame(columns=tests_columns, data=[
                [test_id, np.mean(score), np.min(score), np.max(score), np.median(score),
                 json.dumps(dict(zip(keys, [str(g) for g in df_grid])))]])
            test_id += 1
            tests = pd.concat([tests, score_df])
    except RuntimeError:
        #   When timeout occurs an exception is raised and the main cycle is broken
        pass

#   Spit out the results
tests.to_csv("grid-search.csv", index=False)
print(tests)


**if __name__ == "__main__":
grid_search(timeout_seconds=3600, cv_splits=4, num_boost_round=500)**

错误信息:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<command-3902447645915365> in <module>
    106 
    107 if __name__ == "__main__":
--> 108     grid_search(timeout_seconds=3600, 
cv_splits=4, num_boost_round=500)

<command-3902447645915365> in grid_search(timeout_seconds, cv_splits, num_boost_round)
     84                     #   Create the classifier using the current grid params. Apply early stopping of 50 rounds
     85                     '''clf = xgb.train(params, 
d_train, boosting_rounds, watchlist, 
early_stopping_rounds=50, feval=log_loss, 
maximize=True, verbose_eval=10)'''
---> 86                     clf = xgb.train(params, 
d_train, num_boost_round, watchlist, 
early_stopping_rounds=50, feval=log_loss, 
maximize=True, verbose_eval=10)
     87                     y_hat = clf.predict(d_test)
     88 

/databricks/python/lib/python3.8/site- 
packages/xgboost/training.py in train(params, dtrain, 
num_boost_round, evals, obj, feval, maximize, 
early_stopping_rounds, evals_result, verbose_eval, 
xgb_model, callbacks)
    204     Booster : a trained booster model
    205     """
--> 206     bst = _train_internal(params, dtrain,
    207                           
num_boost_round=num_boost_round,
    208                           evals=evals,

/databricks/python/lib/python3.8/site-packages/xgboost/training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks, evals_result, maximize, verbose_eval, early_stopping_rounds)
    107         nboost += 1
    108         # check evaluation result.
--> 109         if callbacks.after_iteration(bst, i, 
dtrain, evals):
    110             break
    111         # do checkpoint after evaluation, in 
case evaluation also updates

/databricks/python/lib/python3.8/site- 
packages/xgboost/callback.py in after_iteration(self, 
model, epoch, dtrain, evals)
    421             for _, name in evals:
    422                 assert name.find('-') == -1, 
'Dataset name should not contain `-`'
--> 423             score = model.eval_set(evals, 
epoch, self.metric)
    424             score = score.split()[1:]  # into 
datasets
    425             # split up `test-error:0.1234`

/databricks/python/lib/python3.8/site- 
packages/xgboost/core.py in eval_set(self, evals, 
iteration, feval)
   1350         if feval is not None:
   1351             for dmat, evname in evals:
-> 1352                 feval_ret = 
feval(self.predict(dmat, training=False,
   1353                                                
output_margin=True), dmat)
   1354                 if isinstance(feval_ret, list):

/databricks/python/lib/python3.8/site- 
packages/sklearn/utils/validation.py in inner_f(*args, 
**kwargs)
     70                           FutureWarning)
     71         kwargs.update({k: arg for k, arg in 
zip(sig.parameters, args)})
---> 72         return f(**kwargs)
     73     return inner_f
     74 

/databricks/python/lib/python3.8/site- 
packages/sklearn/metrics/_classification.py in 
log_loss(y_true, y_pred, eps, normalize, sample_weight, 
labels)
   2184     The logarithm used is the natural logarithm 
(base-e).
   2185     """
-> 2186     y_pred = check_array(y_pred, 
ensure_2d=False)
   2187     check_consistent_length(y_pred, y_true, 
sample_weight)
   2188 

/databricks/python/lib/python3.8/site- 
packages/sklearn/utils/validation.py in inner_f(*args, 
**kwargs)
     70                           FutureWarning)
     71         kwargs.update({k: arg for k, arg in 
zip(sig.parameters, args)})
---> 72         return f(**kwargs)
     73     return inner_f
     74 

/databricks/python/lib/python3.8/site- 
packages/sklearn/utils/validation.py in 
check_array(array, accept_sparse, accept_large_sparse, 
dtype, order, copy, force_all_finite, ensure_2d,  
allow_nd, ensure_min_samples, ensure_min_features, 
estimator)
    636         # make sure we actually converted to 
numeric:
    637         if dtype_numeric and array.dtype.kind 
== "O":
--> 638             array = array.astype(np.float64)
    639         if not allow_nd and array.ndim >= 3:
    640             raise ValueError("Found array with 
dim %d. %s expected <= 2."

TypeError: float() argument must be a string or a number, not 'DMatrix'

我正在使用 Databricks、Python 3.8.8 和 xgboost 1.3.1。

我正在尝试改编以下教程中的代码:Effortless Hyperparameters Tuning with Apache Spark。

标签: python-3.xxgboostaws-databricks

解决方案


推荐阅读