step_rose() fails within a tuning grid

Problem description

I have noticed that when training with some engines (e.g. keras and xgboost), the recipe returns more ys than Xs.

Here is a minimal reproducible example:

library(themis)
library(recipes)
library(tune)
library(parsnip)
library(workflows)
library(dials)
library(rsample)

xg_mod <- parsnip::boost_tree(mode = "classification",
                              trees = tune(),    
                              tree_depth = tune(),    
                              min_n = tune(),         
                              loss_reduction = tune(),
                              learn_rate = tune()) %>%
    set_engine("xgboost")

xg_grid <- grid_latin_hypercube(over_ratio(range = c(0,1)),
                                trees(),
                                tree_depth(),
                                min_n(),
                                loss_reduction(),
                                learn_rate(),
                                size = 5)

my_recipe <- recipe(class ~ ., data = circle_example) %>%
    step_rose(class, over_ratio = tune())  # over_ratio is marked for tuning here

workflow() %>%
    add_model(xg_mod) %>%
    add_recipe(my_recipe) %>%
    tune_grid(resamples = mc_cv(circle_example, strata = class),
              grid = xg_grid)

The error produced is: Error in data.frame(ynew, Xnew): arguments imply differing number of rows: 385, 386

Tags: r, r-caret, tidymodels, oversampling, r-parsnip

Solution

This has to do with tuning over_ratio. If you skip tuning it, the example runs without error.
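
As a quick sanity check (a sketch using the circle_example data from the question; dplyr::count is used only to tally the classes), the recipe preps and bakes cleanly once over_ratio is a fixed number instead of tune():

library(recipes)
library(themis)

# Sketch: with a fixed over_ratio the step runs fine outside of tune_grid(),
# so the failure is tied to the tuning machinery, not step_rose() itself.
recipe(class ~ ., data = circle_example) %>%
  step_rose(class, over_ratio = 1) %>%
  prep() %>%
  bake(new_data = NULL) %>%
  dplyr::count(class)

The full reprex below shows the same thing for the tuning workflow itself.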

library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.1   
library(themis)
data(iris)

iris_imbalance <- iris %>%
  filter(Species != "setosa") %>% 
  slice_sample(n = 60, weight_by = case_when(
                                    Species == "virginica" ~ 60,
                                    TRUE ~ 1)) %>% 
  mutate(Species = factor(Species))

xg_mod <- parsnip::boost_tree(mode = "classification",
                             trees = tune(),    
                             tree_depth = tune(),    
                             min_n = tune(),         
                             loss_reduction = tune(),
                             learn_rate = tune()) %>%
  set_engine("xgboost")

xg_grid <- grid_latin_hypercube(#over_ratio(range = c(0,1)),
                                trees(),
                                tree_depth(),
                                min_n(),
                                loss_reduction(),
                                learn_rate(),
                                size = 5)

my_recipe <- recipe(Species ~ ., data = iris_imbalance) %>%
  step_rose(Species) #, over_ratio = tune())

workflow() %>%
  add_model(xg_mod) %>%
  add_recipe(my_recipe) %>%
  tune_grid(resamples = mc_cv(iris_imbalance, strata = Species),
            grid = xg_grid)
#> # Tuning results
#> # Monte Carlo cross-validation (0.75/0.25) with 25 resamples  using stratification 
#> # A tibble: 25 x 4
#>    splits          id         .metrics          .notes          
#>    <list>          <chr>      <list>            <list>          
#>  1 <split [46/14]> Resample01 <tibble [10 × 9]> <tibble [0 × 1]>
#>  2 <split [46/14]> Resample02 <tibble [10 × 9]> <tibble [0 × 1]>
#>  3 <split [46/14]> Resample03 <tibble [10 × 9]> <tibble [0 × 1]>
#>  4 <split [46/14]> Resample04 <tibble [10 × 9]> <tibble [0 × 1]>
#>  5 <split [46/14]> Resample05 <tibble [10 × 9]> <tibble [0 × 1]>
#>  6 <split [46/14]> Resample06 <tibble [10 × 9]> <tibble [0 × 1]>
#>  7 <split [46/14]> Resample07 <tibble [10 × 9]> <tibble [0 × 1]>
#>  8 <split [46/14]> Resample08 <tibble [10 × 9]> <tibble [0 × 1]>
#>  9 <split [46/14]> Resample09 <tibble [10 × 9]> <tibble [0 × 1]>
#> 10 <split [46/14]> Resample10 <tibble [10 × 9]> <tibble [0 × 1]>
#> # … with 15 more rows

Created on 2020-11-13 by the reprex package (v0.3.0)
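
If you still want to compare different over_ratio values despite this, one workaround (a sketch, not an official tidymodels API; tune_with_ratio is a hypothetical helper) is to fix the ratio inside the recipe and tune only the model parameters, repeating the run per ratio:

# Reuse one set of resamples so the ratios are compared on equal footing.
folds <- mc_cv(iris_imbalance, strata = Species)

# Hypothetical helper: run the same model-parameter grid for one fixed ratio.
tune_with_ratio <- function(ratio) {
  rec <- recipe(Species ~ ., data = iris_imbalance) %>%
    step_rose(Species, over_ratio = ratio)

  workflow() %>%
    add_model(xg_mod) %>%
    add_recipe(rec) %>%
    tune_grid(resamples = folds, grid = xg_grid)
}

results_by_ratio <- purrr::map(c(0.5, 0.75, 1), tune_with_ratio)

Because the folds are created once and reused, differences between the three result sets reflect the over_ratio choice rather than resampling noise.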

