Scipy and Sklearn Yeo-Johnson normalization results do not match

Problem description

I am running the Yeo-Johnson transform, following the example given on the Scipy website (Scipy link), and I am also comparing it with the Sklearn implementation. Here is the code:

    import seaborn as sns
    from sklearn.preprocessing import PowerTransformer
    from scipy import stats
    import matplotlib.pyplot as plt
    import numpy as np

    fig = plt.figure( figsize=(10,10))
    ax1 = fig.add_subplot(421)
    x = stats.loggamma.rvs(5, size=500) + 5
    prob = stats.probplot(x, dist=stats.norm, plot=ax1)
    ax1.set_xlabel('')
    ax1.set_title('Probplot')

    ax2 = fig.add_subplot(422)
    # Note: sns.distplot is deprecated in recent seaborn releases; sns.histplot(x, kde=True) is the newer equivalent
    sns.distplot(x, color="skyblue")
    ax2.set_title('Distribution of Data')

    ax3 = fig.add_subplot(423)
    # SciPy: Yeo-Johnson transform only; returns the transformed data and the fitted lambda
    xt_scipy, lmbda = stats.yeojohnson(x)
    prob = stats.probplot(xt_scipy, dist=stats.norm, plot=ax3)
    ax3.set_xlabel('')
    ax3.set_title('Probplot:Yeo-Johnson:Scipy')

    ax4 = fig.add_subplot(424)
    sns.distplot(xt_scipy, color="skyblue")
    ax4.set_title('Distribution of Transformed Data')

    ax5 = fig.add_subplot(425)
    # Sklearn: with standardize=True (the default), the output is also scaled to zero mean and unit variance
    pt = PowerTransformer(method='yeo-johnson', standardize=True)
    xt_sklearn = pt.fit_transform(x.reshape(-1,1))
    prob = stats.probplot(xt_sklearn.flatten(), dist=stats.norm, plot=ax5)
    ax5.set_xlabel('')
    ax5.set_title('Probplot:Yeo-Johnson:Sklearn')

    ax6 = fig.add_subplot(426)
    sns.distplot(xt_sklearn, color="skyblue")
    ax6.set_title('Distribution of Transformed Data')
    plt.tight_layout(h_pad=0.9, w_pad=0.9)
    plt.show()

Looking at the attached figure (Figure 1), both methods appear to normalize the data as expected, judging from the quantile plots. However, while the distribution plots of the transformed data from the two libraries have the same shape, their value ranges differ. Why are the transformed values different? Which one corresponds to the true Yeo-Johnson formula?
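
For reference, here is a minimal sketch of the Yeo-Johnson transform as defined by Yeo and Johnson (2000); the helper name yeo_johnson_reference is mine, and lmbda stands for the fitted λ returned by either library:

    import numpy as np

    def yeo_johnson_reference(y, lmbda):
        # Element-wise Yeo-Johnson transform, piecewise on the sign of y
        y = np.asarray(y, dtype=float)
        out = np.empty_like(y)
        pos = y >= 0
        if lmbda != 0:
            out[pos] = ((y[pos] + 1) ** lmbda - 1) / lmbda
        else:
            out[pos] = np.log1p(y[pos])
        if lmbda != 2:
            out[~pos] = -((1 - y[~pos]) ** (2 - lmbda) - 1) / (2 - lmbda)
        else:
            out[~pos] = -np.log1p(-y[~pos])
        return out

Comparing, say, yeo_johnson_reference(x, lmbda) against either library's output shows which of the two matches the raw formula.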

Tags: python, scipy, normalization

Solution


This was my mistake. I did not realize that, by default, Sklearn applies standard scaling after the power transform. Here is a modified version of the code that produces matching results.

    import seaborn as sns
    import sklearn.preprocessing
    from sklearn.preprocessing import PowerTransformer, StandardScaler
    from scipy import stats
    import matplotlib.pyplot as plt
    import numpy as np

    ss = StandardScaler()
    fig = plt.figure( figsize=(10,10))
    ax1 = fig.add_subplot(441)
    x = stats.loggamma.rvs(5, size=500) + 5
    prob = stats.probplot(x, dist=stats.norm, plot=ax1)
    ax1.set_xlabel('')
    ax1.set_title('Probplot')

    ax2 = fig.add_subplot(442)
    sns.distplot(x, color="skyblue")
    ax2.set_title('Distribution of Data')

    ax5 = fig.add_subplot(445)
    xt_scipy, lmbda = stats.yeojohnson(x)
    prob = stats.probplot(xt_scipy, dist=stats.norm, plot=ax5)
    ax5.set_xlabel('')
    ax5.set_title('Probplot:Yeo-Johnson:Scipy')

    ax6 = fig.add_subplot(446)
    sns.distplot(xt_scipy, color="skyblue")
    ax6.set_title('Distribution of Transformed Data')

    ax7 = fig.add_subplot(447)
    # SciPy transform followed by an explicit StandardScaler, to mimic Sklearn's default behaviour
    xt_scipy_ss, lmbda = stats.yeojohnson(x)
    xt_scipy_ss = ss.fit_transform(xt_scipy_ss.reshape(-1, 1))
    prob = stats.probplot(xt_scipy_ss.flatten(), dist=stats.norm, plot=ax7)
    ax7.set_xlabel('')
    ax7.set_title('Probplot:Yeo-Johnson + Stand Scal:Scipy')

    ax8 = fig.add_subplot(448)
    sns.distplot(xt_scipy_ss, color="skyblue")
    ax8.set_title('Distribution of Transformed Data')

    ax9 = fig.add_subplot(449)
    # Sklearn with standardize=False returns the raw Yeo-Johnson output, matching scipy.stats.yeojohnson
    pt = PowerTransformer(method='yeo-johnson', standardize=False)
    xt_sklearn = pt.fit_transform(x.reshape(-1,1))
    prob = stats.probplot(xt_sklearn.flatten(), dist=stats.norm, plot=ax9)
    ax9.set_xlabel('')
    ax9.set_title('Probplot:Yeo-Johnson:Sklearn')

    ax10 = fig.add_subplot(4,4,10)
    sns.distplot(xt_sklearn, color="skyblue")
    ax10.set_title('Distribution of Transformed Data')

    ax11 = fig.add_subplot(4,4,11)
    # Sklearn default: Yeo-Johnson transform followed by standard scaling
    pt = PowerTransformer(method='yeo-johnson', standardize=True)
    xt_sklearn_ss = pt.fit_transform(x.reshape(-1, 1))
    prob = stats.probplot(xt_sklearn_ss.flatten(), dist=stats.norm, plot=ax11)
    ax11.set_xlabel('')
    ax11.set_title('Probplot:Yeo-Johnson:Sklearn with Stand Scal')

    ax12 = fig.add_subplot(4, 4, 12)
    sns.distplot(xt_sklearn_ss, color="skyblue")
    ax12.set_title('Distribution of Transformed Data')
    plt.tight_layout(h_pad=0.9, w_pad=0.9)
    plt.show()
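
As a quick sanity check (a sketch, assuming the arrays from the code above are still in scope), the raw Scipy output should match Sklearn with standardize=False, and the standard-scaled Scipy output should match Sklearn's default; both comparisons should print True up to small numerical differences in the fitted lambda:

    # scipy.stats.yeojohnson vs PowerTransformer(standardize=False)
    print(np.allclose(xt_scipy, xt_sklearn.flatten()))
    # scipy + StandardScaler vs PowerTransformer(standardize=True)
    print(np.allclose(xt_scipy_ss.flatten(), xt_sklearn_ss.flatten()))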
