首页 > 解决方案 > Xgboost tweedie:为什么由 link 值得到预测值的公式是 exp(link)/2?

问题描述

这是一个有点小众的问题,但我真的不明白。

当我运行 Tweedie GLM 时,可以通过 exp(link) 由 link 值得到预测值。而要得到 Tweedie GBM(xgboost)的预测值,我却需要用 exp(link)/2 才能由 link 值得到预测值。我不明白为什么需要除以 2。

下面是一个最小可复现示例,改编自 https://github.com/dmlc/xgboost/blob/master/R-package/demo/tweedie_regression.R 上的 tweedie 回归演示:

library(xgboost)
library(data.table)
library(cplm) # for insurance data
library(statmod) # for tweedie glm
data(AutoClaim)

# Auto insurance dataset analyzed by Yip and Yau (2005).
dt <- data.table(AutoClaim)

# Columns kept out of the model matrix (CLM_AMT5 is the response, see below).
exclude <- c(
  "POLICYNO", "PLCYDATE", "CLM_FREQ5", "CLM_AMT5", "CLM_FLAG", "IN_YY"
)

# Retain the missing values: na.action is switched to "na.pass" only while
# the model matrix is built. The previous setting is saved and restored
# afterwards instead of being overwritten with a hard-coded "na.omit", so a
# caller's own na.action is not clobbered.
# NOTE: this dataset comes ready out of the box.
old_opts <- options(na.action = "na.pass")
x <- sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = FALSE])
options(old_opts)

# Response.
y <- dt[, CLM_AMT5]

d_train <- xgb.DMatrix(data = x, label = y, missing = NA)

# The tweedie_variance_power parameter determines the shape of the
# distribution:
# - closer to 1 is more Poisson-like and the mass is more concentrated
#   near zero
# - closer to 2 is more gamma-like and the mass spreads to the right with
#   less concentration near zero
# NOTE(review): base_score is not set here, so xgboost's own default applies;
# that default is what the manual link computation later in the script has to
# account for.
params <- list(
  objective = "reg:tweedie",
  eval_metric = "rmse",
  tweedie_variance_power = 1.4,
  max_depth = 2,
  eta = 1
)

# Three boosting rounds only, so the trees can be read by hand.
set.seed(42)
bst <- xgb.train(
  data = d_train,
  params = params,
  maximize = FALSE,
  watchlist = list(train = d_train),
  nrounds = 3
)


xgb.plot.tree(model = bst)
```

# Manually extract the feature values for the first record:
x[1, ]

# Leaf reached by the first record in each of the three trees (read off the
# tree plot):
# TRAVTIME < 102, BLUEBOOK < 61645                            --> tree #1 value = 2.49922585
# REVOLKEDYes < -9.53674316e-07, NPOLICY < 5.5                --> tree #2 value = 2.48586464
# REVOLKEDYes < -9.53674316e-07, AREAUrban > -9.53674316e-07  --> tree #3 value = 2.36028123

link_gbm <- 2.49922585 + 2.48586464 + 2.36028123
link_gbm # 7.345372

# The sum of leaf values is not the whole margin: xgboost starts boosting
# from an intercept derived from base_score. With the log link used by
# reg:tweedie that intercept is log(base_score), so
#   prediction = exp(log(base_score) + link_gbm) = base_score * exp(link_gbm).
# base_score is assumed to be 0.5 here (the historical xgboost default when
# it is not set in params), which is exactly the "divide by 2".
# NOTE(review): newer xgboost releases may estimate base_score from the data
# when it is left unset -- confirm against the installed version.
base_score <- 0.5
base_score * exp(link_gbm) # 774.5053

# Compare with getting the prediction directly from the GBM.
predict(bst, d_train)[1] # 774.5053

# Cross-check on the link scale: the raw margin should equal
# link_gbm + log(base_score) (about 6.652225 under the assumption above).
predict(bst, d_train, outputmargin = TRUE)[1]
link_gbm + log(base_score)


# Let's do the same with a GLM: same predictors as the xgboost model, plus
# the response column CLM_AMT5.
dt2 <- dt[, -exclude, with = FALSE]
dt2$CLM_AMT5 <- dt$CLM_AMT5

# Tweedie GLM with the same variance power (1.4); link.power = 0 selects the
# log link, so predictions are exp(linear predictor).
tweedie_fit <-
  glm(CLM_AMT5 ~ .,
      family = tweedie(var.power = 1.4, link.power = 0),
      data = dt2)

summary(tweedie_fit)

# Manually get the link value for the first record.
dt2[1, ]

# Covariate values of the first record, keyed by coefficient name. Terms not
# listed are taken to contribute nothing for this record (presumably zero
# values or reference factor levels -- check against dt2[1, ] above).
first_record <- c(
  "(Intercept)" = 1,
  TRAVTIME = 14,
  BLUEBOOK = 14230,
  RETAINED = 11,
  NPOLICY = 1,
  CAR_TYPESedan = 1,
  RED_CARyes = 1,
  MVR_PTS = 3,
  AGE = 60,
  YOJ = 11,
  INCOME = 67349,
  GENDERM = 1,
  JOBCLASSProfessional = 1,
  MAX_EDUCPhD = 1,
  SAMEHOME = 18,
  AREAUrban = 1
)

beta <- coef(tweedie_fit)
# Fail loudly if a coefficient name is misspelled or absent from the fit,
# rather than silently producing NA.
stopifnot(all(names(first_record) %in% names(beta)))

# Linear predictor = sum of covariate value * coefficient.
link_glm <- sum(first_record * beta[names(first_record)])

link_glm # 8.299899

# Prediction is exp(link_glm): no extra factor is needed for the GLM.
exp(link_glm) # 4023.466

# Compare with the link and the prediction from glm ... yes, it's identical.
predict(tweedie_fit, type = "link")[1]

predict(tweedie_fit, type = "response")[1] # 4023.466

标签: r xgboost tweedie

解决方案


推荐阅读