python - Scipy 和 Sklearn Yeo-Johnson 归一化结果不匹配
问题描述
我正在按照 Scipy 网站上给出的示例运行 Yeo-Johnson 变换(见 Scipy 链接),并将其结果与 Sklearn 的实现进行了比较。代码如下:
import seaborn as sns
from sklearn.preprocessing import PowerTransformer
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np
# Compare the Yeo-Johnson transform produced by scipy.stats.yeojohnson with
# the one produced by sklearn's PowerTransformer(standardize=True) on the same
# sample.  Each row of the figure pairs a normal probability plot (left) with
# a histogram/KDE of the same values (right).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the figure matches the original script.


def _draw_probplot(axis, values, title):
    """Draw a normal probability plot of *values* on *axis* and set its title."""
    fitted = stats.probplot(values, dist=stats.norm, plot=axis)
    axis.set_xlabel('')
    axis.set_title(title)
    return fitted


def _draw_distribution(axis, values, title):
    """Draw the histogram/KDE of *values* on *axis* and set its title."""
    sns.distplot(values, color="skyblue", ax=axis)
    axis.set_title(title)


# Sample: 500 log-gamma draws (shape 5) shifted by +5.
x = stats.loggamma.rvs(5, size=500) + 5

# SciPy returns the transformed data together with the fitted lambda.
xt_scipy, lmbda = stats.yeojohnson(x)

# sklearn expects a 2-D (n_samples, n_features) array, hence the reshape.
pt = PowerTransformer(method='yeo-johnson', standardize=True)
xt_sklearn = pt.fit_transform(x.reshape(-1, 1))

# 4x2 grid; only the first six cells are used.
fig = plt.figure(figsize=(10, 10))
ax1, ax2, ax3, ax4, ax5, ax6 = [
    fig.add_subplot(4, 2, slot) for slot in range(1, 7)
]

# Row 1: raw data.
prob = _draw_probplot(ax1, x, 'Probplot')
_draw_distribution(ax2, x, 'Distribution of Data')

# Row 2: SciPy transform.
prob = _draw_probplot(ax3, xt_scipy, 'Probplot:Yeo-Johnson:Scipy')
_draw_distribution(ax4, xt_scipy, 'Distribution of Transformed Data')

# Row 3: sklearn transform (flattened to 1-D for the probability plot).
prob = _draw_probplot(ax5, xt_sklearn.flatten(), 'Probplot:Yeo-Johnson:Sklearn')
_draw_distribution(ax6, xt_sklearn, 'Distribution of Transformed Data')

plt.tight_layout(h_pad=0.9, w_pad=0.9)
plt.show()
查看附图,从分位数图(概率图)可以看出,两种方法似乎都按预期使数据更接近正态分布。
但是,两个库的转换数据分布图虽然形状相同,但值范围不同。为什么转换后的值不同?哪一个对应于真正的 Yeo Johnson 公式?
赛迪
解决方案
这是我的疏忽。我没有意识到 Sklearn 的 PowerTransformer 在默认情况下(standardize=True)会在幂变换之后再进行标准缩放。下面是修改后的代码,可以得到相互匹配的结果。
import seaborn as sns
import sklearn.preprocessing
from sklearn.preprocessing import PowerTransformer, StandardScaler
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np
# Show that SciPy and sklearn agree on the Yeo-Johnson transform once the
# standardization step is accounted for.  Four variants of the same sample are
# plotted, each as a normal probability plot followed by a histogram/KDE:
#   * SciPy transform                       (compare with sklearn, standardize=False)
#   * SciPy transform + StandardScaler      (compare with sklearn, standardize=True)
#   * sklearn PowerTransformer, standardize=False
#   * sklearn PowerTransformer, standardize=True
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the figure matches the original script.


def _draw_probplot(axis, values, title):
    """Draw a normal probability plot of *values* on *axis* and set its title."""
    fitted = stats.probplot(values, dist=stats.norm, plot=axis)
    axis.set_xlabel('')
    axis.set_title(title)
    return fitted


def _draw_distribution(axis, values, title):
    """Draw the histogram/KDE of *values* on *axis* and set its title."""
    sns.distplot(values, color="skyblue", ax=axis)
    axis.set_title(title)


ss = StandardScaler()

# Sample: 500 log-gamma draws (shape 5) shifted by +5.
x = stats.loggamma.rvs(5, size=500) + 5

# SciPy returns the transformed data together with the fitted lambda.
xt_scipy, lmbda = stats.yeojohnson(x)

# Standardize the SciPy output.  The transform is computed once and reused
# here instead of calling stats.yeojohnson(x) a second time; StandardScaler
# copies its input by default, so xt_scipy itself is left untouched.
xt_scipy_ss = ss.fit_transform(xt_scipy.reshape(-1, 1))

# sklearn expects a 2-D (n_samples, n_features) array, hence the reshape.
pt = PowerTransformer(method='yeo-johnson', standardize=False)
xt_sklearn = pt.fit_transform(x.reshape(-1, 1))

pt = PowerTransformer(method='yeo-johnson', standardize=True)
xt_sklearn_ss = pt.fit_transform(x.reshape(-1, 1))

# 4x4 grid; cells 3, 4 and the whole last row are intentionally left empty,
# matching the original layout.
fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(4, 4, 1)
ax2 = fig.add_subplot(4, 4, 2)
ax5 = fig.add_subplot(4, 4, 5)
ax6 = fig.add_subplot(4, 4, 6)
ax7 = fig.add_subplot(4, 4, 7)
ax8 = fig.add_subplot(4, 4, 8)
ax9 = fig.add_subplot(4, 4, 9)
ax10 = fig.add_subplot(4, 4, 10)
ax11 = fig.add_subplot(4, 4, 11)
ax12 = fig.add_subplot(4, 4, 12)

# Row 1: raw data.
prob = _draw_probplot(ax1, x, 'Probplot')
_draw_distribution(ax2, x, 'Distribution of Data')

# Row 2: SciPy, without and with standard scaling.
prob = _draw_probplot(ax5, xt_scipy, 'Probplot:Yeo-Johnson:Scipy')
_draw_distribution(ax6, xt_scipy, 'Distribution of Transformed Data')
prob = _draw_probplot(
    ax7, xt_scipy_ss.flatten(), 'Probplot:Yeo-Johnson + Stand Scal:Scipy'
)
_draw_distribution(ax8, xt_scipy_ss, 'Distribution of Transformed Data')

# Row 3: sklearn, without and with standard scaling.
prob = _draw_probplot(ax9, xt_sklearn.flatten(), 'Probplot:Yeo-Johnson:Sklearn')
_draw_distribution(ax10, xt_sklearn, 'Distribution of Transformed Data')
prob = _draw_probplot(
    ax11, xt_sklearn_ss.flatten(), 'Probplot:Yeo-Johnson:Sklearn with Stand Scal'
)
_draw_distribution(ax12, xt_sklearn_ss, 'Distribution of Transformed Data')

plt.tight_layout(h_pad=0.9, w_pad=0.9)
plt.show()
推荐阅读
- angular - Angular Modal dialog box - 如何在模态对话框中单击链接(URL)时关闭模态对话框并将它们带到单击的链接
- couchdb - 为字段添加了自定义索引,但 ORDER BY 仍然得到 couchdb no index exists for sort error hyperledger
- java - 自定义 DatePicker 以选择年份和月份
- c# - InvalidOperationException:提供了无效的请求 URI。请求 URI 必须是绝对 URI 或必须设置 BaseAddress
- php - 我如何获得地址
- validation - 使用 Javascript 验证 UserIdentityToken
- angularjs - 无法注入服务
- android - 活动生命周期回调没有完成执行?
- javascript - Javascript - 输入验证 - 我可以检查字符 x 是否为数字,字符 y 是否为字母
- javascript - Electron js不显示来自Mysql的数据