r - 套索射击和 glmnet 不同的结果
问题描述
我想比较lassoshooting
和glmnet
套索。
中没有标准化选项lassoshooting
;所以我首先标准化数据,拟合模型并重新标准化为原始比例。
结果不同,似乎lassoshooting
beta 更接近原始 beta。
我有错吗?
编码:
library(lassoshooting)
library(glmnet)
set.seed(327)
n = 500
p = 9
x = matrix(rnorm(n*p), ncol=p)
n = nrow(x)
b = c(.5, -.5, .25, -.25, .125, -.125, rep(0, 3))
y = x %*% b + rnorm(n, sd=.05)
xs = scale(x)
ys = scale(y)
lam = 0.1
glmnet_res = coef(glmnet(x, y), s=lam)[-1]
lassoshooting_res = lassoshooting(X=xs, y=ys, thr=1e-7, lambda=n*lam)$coefficients
# n in n*lam stems from difference between objective functions of two packages
# standard deviations for original scale
sds = apply(x,2,sd)
sdy = sd(y)
lasso_shooting_o = sdy*lassoshooting_res/sds
# compare
cbind(glmnet=glmnet_res, lassoshooting=lasso_shooting_o)
glmnet lassoshooting
[1,] 0.40123563 0.42270220
[2,] -0.38733635 -0.41195555
[3,] 0.14463257 0.16727953
[4,] -0.15914094 -0.17799495
[5,] 0.02942027 0.04958667
[6,] -0.01465437 -0.03777288
[7,] 0.00000000 0.00000000
[8,] 0.00000000 0.00000000
[9,] 0.00000000 0.00000000
# Is lassoshooting closer to true parameters ?
abs(lasso_shooting_o-b) <= abs(glmnet_res-b)
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
编辑:根据下面的评论,新代码
library(lassoshooting)
library(glmnet)
set.seed(327)
n = 500
p = 9
x = matrix(rnorm(n*p), ncol=p)
n = nrow(x)
b = c(5, -5, 25, -25, 125, -125, rep(0, 3))
y = x %*% b + rnorm(n, sd=.05)
# 1/n type standardization
xc = sweep(x, 2, colMeans(x))
sdc = sqrt(apply(xc, 2, crossprod)/nrow(x))
xs = sweep(xc, 2, sdc, "/")
ys = scale(y)*sqrt(n/(n-1))
lam = 0.1
# BOTH ARE STANDARDIZED: RESULTS ARE THE SAME
glmnet_std = coef(glmnet(xs, ys,standardize=F), s=lam)[-1]
lassoshooting_std = lassoshooting(X=xs, y=ys, thr=1e-7, lambda=n*lam)$coefficients
# n in n*lam stems from difference between objective functions of two packages
# compare
cbind(glmnet=glmnet_std, lassoshooting=lassoshooting_std)
#########################################################
glmnet lassoshooting
[1,] 0.00000000 0.00000000
[2,] 0.00000000 0.00000000
[3,] 0.04224107 0.04224107
[4,] -0.04178765 -0.04178765
[5,] 0.59188462 0.59188462
[6,] -0.59781943 -0.59781943
[7,] 0.00000000 0.00000000
[8,] 0.00000000 0.00000000
[9,] 0.00000000 0.00000000
#########################################################
# glmnet on ORIGINAL DATA with its own standardization
# lassoshooting on SCALED DATA (THEN RE-SCALED)
# VERY DIFFERENT RESULTS (selected variables are different too)
glmnet_o = coef(glmnet(x, y), s=lam)[-1]
# original scale
sdy = sd(y)/sqrt(n/(n-1))
lasso_shooting_o = sdy*lassoshooting_std/sdc # sdc is defined above
# compare
cbind(glmnet=glmnet_o, lassoshooting=lasso_shooting_o)
#########################################################
glmnet lassoshooting
[1,] 2.742806 0.000000
[2,] -2.412466 0.000000
[3,] 22.618378 7.379036
[4,] -23.014604 -7.205713
[5,] 122.877714 106.617676
[6,] -122.566183 -105.931439
[7,] 0.000000 0.000000
[8,] 0.000000 0.000000
[9,] 0.000000 0.000000
#########################################################
# OBVIOUSLY glmnet is correct.
解决方案
请注意,它lassoshooting
给出了与 相同的结果(无论如何都是几个数字)glmnet(xs, ys, standardize=FALSE)
,所以我们真正感兴趣的是为什么内部标准化与外部标准化不同:
> coef(glmnet(xs, ys, intercept=FALSE, standardize=FALSE), s=lam)[-1]
[1] 0.54023720669 -0.51377928289 0.21423980260 -0.23094074895 0.06158780181
[6] -0.04769218136 0.00000000000 0.00000000000 0.00000000000
> lassoshooting(X=xs, y=ys, thr=1e-7, lambda=n*lam)$coefficients
[1] 0.54023682002 -0.51377917401 0.21423976696 -0.23094082042 0.06158781750
[6] -0.04769218772 0.00000000000 0.00000000000 0.00000000000
在内部,当 glmnet 标准化时,它使用 n 的分母,而scale()
使用 n - 1。我们可以自己调整:
> coef(glmnet(x, ys * sqrt((n-1)/n)), s=lam)[-1] * sdy / sqrt((n-1)/n)
[1] 0.42270249793 -0.41195563736 0.16727956071 -0.17799489896 0.04958665639
[6] -0.03777287171 0.00000000000 0.00000000000 0.00000000000
编辑:
另请记住,glmnet 默认情况下会添加拦截,所以
glmnet(x, y, intercept=FALSE)
非常不同glmnet(x, y)
- 这也有点难以比较,因为它看起来像是lassoshooting
把它的 lambda 参数放在不同的规模上?无论如何,尝试
lassoshooting(X=cbind(1,xs), y=y, thr=1e-7, lambda=n*sqrt(n)*lam, nopenalize=0) * sdc
这很接近但不完全相同,?lassoshooting
如关于拦截的示例。
推荐阅读
- c# - 没有元素从另一个类添加到 ObservableCollection
- android - 在 onChildChanged 中调用 ArrayList set 方法创建具有更新记录的重复记录
- c++ - 基于网格的流体模拟错误
- c# - 无法在 MVC 中下载大于 2 GB 的文件
- python - Discord.py 获取消息嵌入
- ios - 当tableview中的行被选中时,tableview恢复到初始状态
- typescript - 如何显式声明某个 Phaser 类型的变量?
- css - 删除内联样式的正则表达式
- r - R计数功能
- javascript - repl.it javascript 石头、纸、剪刀作业。我错过了什么?