首页 > 解决方案 > 给定直方图计算 PDF

问题描述

我有一个严重右偏的直方图,想计算一系列 Lifetime 值落在某区间内的概率(即 PDF 曲线下的面积)。例如,Lifetime 值落在 (0-0.01) 内的概率。

由按累积收入/累积安装计算的 LTV 组成的数据框:

df['LTV']

(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.208125,0.0558879,0.608348,0.212553,0.0865896,
 0.728542,0,0.609512,0,0,0,0,0,0,0,0.0801339,0.140657,0.0194118,0,0,0.0634682,
 0.339545,0.875902,0.8325,0.0260526,0.0711905,0.169894,0.202969,0.0761538,0,0.342055,
 0.42781,0,0,0.192115,0,0,0,0,0,0,0,0,0,0,0,1.6473,0,0.232329,0,2.21329,0.748,0.0424286,
 0.455439,0.210282,5.56453,0.427959,0,0.352059,0,0,0.567059,0,0,0,0.384462,1.29476,
 0.0103125,0,0.0126923,1.03356,0,0,0.289785,0,0)

我曾尝试使用 sklearn 的 KernelDensity,但是,将其拟合到数据后,它并没有捕获到在 0 处过度集中的大量取值。

import gc
from sklearn.neighbors import KernelDensity

def plot_prob_density(df_lunch, field, x_start, x_end):
    """Plot a normalized histogram of the data with a Gaussian-KDE curve
    overlaid, marking the [x_start, x_end] interval with dashed lines.

    Parameters
    ----------
    df_lunch : array-like of shape (n_samples, 1) — data to estimate.
    field : str — label for the x axis.
    x_start, x_end : float — interval bounds drawn as vertical lines.

    Returns
    -------
    The fitted sklearn KernelDensity estimator.
    """
    plt.figure(figsize=(10, 7))

    margin = 0  # no padding around the data range
    grid = np.linspace(df_lunch.min() - margin, df_lunch.max() + margin, 1000)[:, np.newaxis]

    # Normalized histogram of the raw data.
    plt.hist(df_lunch, bins=200, density=True, label='LTV', color='blue', alpha=0.2)

    # Fit a Gaussian kernel density estimate (fixed, very small bandwidth).
    estimator = KernelDensity(kernel='gaussian', bandwidth=0.00187).fit(df_lunch)

    # Overlay the estimated density (score_samples returns log-density).
    plt.plot(grid, np.exp(estimator.score_samples(grid)), color='orange')

    # Mark the interval of interest.
    for bound in (x_start, x_end):
        plt.axvline(x=bound, color='red', linestyle='dashed')

    plt.xlabel(field, fontsize=15)
    plt.ylabel('Probability Density', fontsize=15)
    plt.legend(fontsize=15)
    plt.show()
    gc.collect()
    return estimator
# Fit/plot the KDE on the LTV column (reshaped to a 2-D column vector, as
# required by sklearn) and keep the fitted estimator for later integration.
# NOTE(review): `final_df` is defined elsewhere — not visible in this snippet.
kd_lunch = plot_prob_density(final_df['LTV'].values.reshape(-1,1), 'LTV', x_start=0, x_end=0.01)

然后找到这样的概率:

def get_probability(start_value, end_value, eval_points, kd):
    """Approximate P(start_value <= X <= end_value) under a fitted KDE.

    Integrates the estimated PDF over [start_value, end_value] with the
    trapezoidal rule. The original left-rectangle sum (N values x step,
    step = range/(N-1)) effectively integrates over an interval one step
    too wide and systematically overestimates the probability.

    Parameters
    ----------
    start_value, end_value : float — integration bounds.
    eval_points : int — number of evaluation points on the grid (>= 2).
    kd : fitted estimator exposing ``score_samples`` (returns log-density),
        e.g. sklearn.neighbors.KernelDensity.

    Returns
    -------
    Probability mass in the interval, rounded to 4 decimal places.

    Raises
    ------
    ValueError — if eval_points < 2 (step size would be undefined).
    """
    if eval_points < 2:
        raise ValueError("eval_points must be >= 2")

    step = (end_value - start_value) / (eval_points - 1)  # grid spacing

    # Column vector of evaluation points, as sklearn expects.
    x = np.linspace(start_value, end_value, eval_points)[:, np.newaxis]
    # score_samples returns log-density; exponentiate to get the PDF.
    kd_vals = np.exp(kd.score_samples(x))
    # Trapezoidal rule: average adjacent PDF values, weight by step.
    probability = np.sum((kd_vals[:-1] + kd_vals[1:]) / 2.0 * step)
    return probability.round(4)


print('Probability of LTV 0-3  tips during LUNCH time: {}\n'
      .format(get_probability(start_value = 0, 
                              end_value = 0.01, 
                              eval_points = 100, 
                              kd = kd_lunch)))

但是,这种方法不能产生我们想要的合适的 PDF 值。任何有关替代方法的建议将不胜感激。

绘图:

在此处输入图像描述

标签: python、probability-density

解决方案


我在工作中使用了或多或少类似的脚本,这是我的脚本,可能对您有所帮助。

import gc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.neighbors import KernelDensity
from scipy import stats
# First element of `beta_95` is the sample to analyse.
# NOTE(review): `beta_95` is defined elsewhere — not visible in this snippet.
data1 = beta_95[0]

def plot_prob_density(data1, x_start, x_end):
    """Plot a normalized histogram of `data1` with a Gaussian-KDE curve on
    the fixed window [-20, 20], plus dashed reference lines for the beta
    thresholds.

    Parameters
    ----------
    data1 : array-like of shape (n_samples, 1) — data to estimate.
    x_start : float — unused here (kept for interface compatibility).
    x_end : float — position of the red dashed reference line.

    Returns
    -------
    The fitted sklearn KernelDensity estimator.
    """
    plt.figure(figsize=(4, 3.5))

    margin = 1.5  # kept for parity with the original script (not used below)
    grid = np.linspace(-20, 20, 1000)[:, np.newaxis]

    # Normalized histogram of the raw data.
    plt.hist(data1, bins=np.linspace(-20, 20, 40), density=True, color='r', alpha=0.4)

    # Gaussian KDE with a fixed bandwidth of 1.8.
    estimator = KernelDensity(kernel='gaussian', bandwidth=1.8).fit(data1)

    # Overlay the estimated density (score_samples returns log-density).
    plt.plot(grid, np.exp(estimator.score_samples(grid)), color='r', label='$N_a$', linewidth=2)

    # Dashed reference lines for the beta thresholds.
    plt.axvline(x=9.95, color='green', linestyle='dashed', linewidth=2.0, label='$β_o$')
    plt.axvline(x=1.9, color='black', linestyle='dashed', linewidth=2.0, label='$β_b$')
    plt.axvline(x=x_end, color='red', linestyle='dashed', linewidth=2, label='$β_{95\%}$')

    plt.xlabel('Beta', fontsize=10)
    plt.ylabel('Probability Density', fontsize=10)
    plt.title('02 hours window', fontsize=12)
    plt.xlim(-20, 20)
    plt.ylim(0, 0.3)
    plt.yticks([0, 0.1, 0.2, 0.3])
    plt.legend(fontsize=12, loc='upper left', frameon=False)
    plt.show()
    gc.collect()
    return estimator

def get_probability(start_value, end_value, eval_points, kd):
    """Approximate P(start_value <= X <= end_value) under a fitted KDE.

    Integrates the estimated PDF over [start_value, end_value] with the
    trapezoidal rule. The original left-rectangle sum (N values x step,
    step = range/(N-1)) effectively integrates over an interval one step
    too wide and systematically overestimates the probability.

    Parameters
    ----------
    start_value, end_value : float — integration bounds.
    eval_points : int — number of evaluation points on the grid (>= 2).
    kd : fitted estimator exposing ``score_samples`` (returns log-density),
        e.g. sklearn.neighbors.KernelDensity.

    Returns
    -------
    Probability mass in the interval, rounded to 4 decimal places.

    Raises
    ------
    ValueError — if eval_points < 2 (step size would be undefined).
    """
    if eval_points < 2:
        raise ValueError("eval_points must be >= 2")

    step = (end_value - start_value) / (eval_points - 1)  # grid spacing

    # Column vector of evaluation points, as sklearn expects.
    x = np.linspace(start_value, end_value, eval_points)[:, np.newaxis]
    # score_samples returns log-density; exponentiate to get the PDF.
    kd_vals = np.exp(kd.score_samples(x))
    # Trapezoidal rule: average adjacent PDF values, weight by step.
    probability = np.sum((kd_vals[:-1] + kd_vals[1:]) / 2.0 * step)
    return probability.round(4)

# Reshape to a 2-D column vector, as required by sklearn's KernelDensity.fit.
data1 = np.array(data1).reshape(-1, 1)

# Fit the KDE, draw the plot, and keep the estimator for integration below.
kd_data1 = plot_prob_density(data1, x_start=3.0, x_end=13)

# Probability mass of the estimated PDF between -10 and 13.
print('Beta-95%: {}\n'
      .format(get_probability(start_value = -10, 
                              end_value = 13, 
                              eval_points = 1000, 
                              kd = kd_data1))) 

推荐阅读