首页 > 解决方案 > 在python中将多个概率分布组合成一个分布

问题描述

我有一个涉及使用传感器的实验,我有大约 5 个数据文件,其中包含从传感器收集的时域数据。为简单起见,假设我们专注于一个传感器,我需要获得所有数据文件的概率分布。我在网上查看并设法使用以下链接找到最合适的分布:

使用 Scipy (Python) 将经验分布拟合到理论分布

就我而言,事实证明正态分布适合我的数据。因此,我有多个分布,并希望将它们全部合并到一个分布中。我所做的是通过获取每个密度值并将其除以 5 来平均每个概率密度。

平均代码使用以下代码完成:

def average(l):
    """Element-wise mean across the sub-lists of *l*.

    Treats *l* as rows of equal-length sequences and yields, lazily,
    the mean of each column (position). Returns a lazy iterable, so
    callers typically wrap the result in ``list(...)``.
    """
    count = len(l)
    return (sum(column) / count for column in zip(*l))

# Collect one PDF (as a plain list) per sensor data file, then average
# them element-wise into a single combined density curve.
lt = []  # fix: must be initialised before the loop appends to it
for _ in range(5):
    # read sensor data
    # Obtain the probability distribution using code in the first link
    # Getting list of pdf:
    # NOTE(review): y_axis_pdf must be produced by the fitting step above —
    # it is undefined as shown; confirm it is a 1-D sequence of density values.
    np_pdf = list(y_axis_pdf)

    lt.append(np_pdf)

# Element-wise mean of the five PDFs (average() returns a lazy iterable).
Average_list = average(lt)
Average_list = list(Average_list)

但是,我问了几个人并在网上搜索,它说平均不是最好的方法。那么,将几个概率分布组合在一起的正确方法是什么?

我的第二个问题是我在网上搜索,发现了这篇文章:

如何组合相同数量的独立数据集

如何把第一个链接中的代码与这篇文章中描述的方法结合起来使用?

编辑1:

根据@SeverinPappadeux 的评论,我编辑了我的代码,如下所示:

# Combining all PDF files into one dataset:
# NOTE(review): each np_pdf_0x is assumed to be a 1-D sequence of PDF values
# sampled on a common x-grid — confirm all five share the same grid/length.
pdf_data = [np_pdf_01, np_pdf_02, np_pdf_03, np_pdf_04, np_pdf_05]
pdf_dataframe_ini = pd.DataFrame(pdf_data)
# Transpose so rows become grid points and columns become the five PDFs.
pdf_dataframe = pd.DataFrame.transpose(pdf_dataframe_ini)

# Creating one PDF from the PDF dataset:
# NOTE(review): GMM.fit treats each DataFrame COLUMN as a feature, so the
# model is fitted in a multi-feature space (one feature per input column) —
# this fits density VALUES as if they were raw samples, which is conceptually
# questionable; GMMs are normally fitted on the underlying sensor samples.
gmm = GMM(n_components=1)
gmm.fit(pdf_dataframe)
x_pdf_data = [x_axis_pdf_01, x_axis_pdf_02, x_axis_pdf_03, x_axis_pdf_04, x_axis_pdf_05]
x_pdf = average(x_pdf_data)
x_pdf = list(x_pdf)
# x has shape (len, 1): a SINGLE-feature input. score_samples requires the
# same number of features the model was fitted with, which is why the
# "Expected the input data X have N features, but got 1 features" error is
# raised on the next line — the fit input and the scoring input disagree.
x = np.linspace(np.min(x_pdf), np.max(x_pdf), len(x_pdf)).reshape(len(x_pdf), 1)
logprob = gmm.score_samples(x)
pdf = np.exp(logprob)

我不断收到以下错误:

logprob = gmm.score_samples(x)
ValueError: Expected the input data X have 10 features, but got 1 features

如何解决此错误并获取组合 pdf 的 pdf 图?

资料来源:

如何在 scikit-learn 下绘制拟合高斯混合模型的概率密度函数?

编辑2:

我尝试实现多元正态以将多个分布组合在一起,但是,我收到以下错误消息:

ValueError: shapes (5,2000) and (1,1) not aligned: 2000 (dim 1) != 1 (dim 0)

我将如何解决这个错误?在下面找到代码:

代码:

import scipy.stats as st
import numpy as np
import pandas as pd
import scipy.stats as st
from matplotlib import pyplot as plt
from scipy.integrate import quad,simps, quad_vec, nquad
import winsound
from functools import reduce
from itertools import chain
import scipy.stats as st
from glob import glob
from collections import defaultdict, Counter
from sklearn.neighbors import KDTree
import pywt
import peakutils
import scipy
import os
from scipy import signal
from scipy.fftpack import fft, fftfreq, rfft, rfftfreq, dst, idst, dct, idct
from scipy.signal import find_peaks, find_peaks_cwt, argrelextrema, welch, lfilter, butter, savgol_filter, medfilt, freqz, filtfilt
from pylab import *
import glob
import sys
import re
from numpy import NaN, Inf, arange, isscalar, asarray, array
from scipy.stats import skew, kurtosis, median_absolute_deviation
import warnings
import numpy as np
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, kendalltau, spearmanr, ppcc_max
import matplotlib.mlab as mlab
from statsmodels.graphics.tsaplots import plot_acf
from tsfresh.feature_extraction.feature_calculators import mean_abs_change as mac
from tsfresh.feature_extraction.feature_calculators import mean_change as mc
from tsfresh.feature_extraction.feature_calculators import mean_second_derivative_central as msdc
from pyAudioAnalysis.ShortTermFeatures import energy as stEnergy
import pymannkendall as mk_test
from sklearn.preprocessing import MinMaxScaler, Normalizer, normalize, StandardScaler
import time
from tsfresh.feature_extraction.feature_calculators import mean_abs_change as mac
from tsfresh.feature_extraction.feature_calculators import mean_change as mc
from tsfresh.feature_extraction.feature_calculators import absolute_sum_of_changes as asc
from tsfresh.feature_extraction.feature_calculators import mean_second_derivative_central as msdc
from sklearn.decomposition import PCA, KernelPCA, SparsePCA, IncrementalPCA
from sklearn.preprocessing import MinMaxScaler, Normalizer, normalize, StandardScaler
import circle_fit as cf
from scipy import optimize
import functools
from math import sqrt, pi
from ellipse import LsqEllipse
import time
from matplotlib.patches import Ellipse
import pandas as pd
import numpy as np
import time
from mlxtend.feature_extraction import PrincipalComponentAnalysis
from sklearn.pipeline import make_pipeline
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns # data visualization library
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from matplotlib.colors import ListedColormap
from scipy.stats import f
# from statsmodels import api as sm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import (KNeighborsClassifier,NeighborhoodComponentsAnalysis)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from sklearn.cross_decomposition import PLSRegression
from sklearn.covariance import EmpiricalCovariance, MinCovDet
from sklearn.decomposition import kernel_pca, KernelPCA
from sklearn.decomposition import sparse_pca, SparsePCA
from sklearn.decomposition import incremental_pca, IncrementalPCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, normalize
from sklearn import linear_model
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.metrics import make_scorer
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.metrics import r2_score
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.signal import savgol_filter
# import tflearn
# import tensorflow as tf
from statistics import mean
import seaborn
import warnings
from sklearn import preprocessing, neighbors
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from scipy.stats import mstats, multivariate_normal

def normalizer(list_values):
    """Scale *list_values* so its entries sum to 1.0.

    Returns a new list of floats; the input is not modified.
    Raises ZeroDivisionError when the values sum to zero.
    """
    # Hoisted out of the comprehension: the original recomputed
    # sum(list_values) once per element, making the call O(n^2).
    total = sum(list_values)
    return [float(v) / total for v in list_values]

# Shared evaluation grid for all five densities.
lb = -10
ub = 10
domain = np.arange(lb, ub, .01)
domain_size = domain.shape[0]
print(domain_size)

# Five normal PDFs evaluated on the common grid.
dist_1 = st.norm.pdf(domain, 2, 1)
dist_2 = st.norm.pdf(domain, 2.5, 1.5)
dist_3 = st.norm.pdf(domain, 2.2, 1.6)
dist_4 = st.norm.pdf(domain, 2.4, 1.3)
dist_5 = st.norm.pdf(domain, 2.7, 1.5)

dists = [dist_1, dist_2, dist_3, dist_4, dist_5]

plt.xlabel("domain")
plt.ylabel("pdf")
plt.title("Conflated PDF")
# Reuse the already-evaluated arrays instead of recomputing st.norm.pdf.
plt.plot(domain, dist_1, 'r', label='Dist. 1')
plt.plot(domain, dist_2, 'g', label='Dist. 2')
plt.plot(domain, dist_3, 'b', label='Dist. 3')
plt.plot(domain, dist_4, 'y', label='Dist. 4')
plt.plot(domain, dist_5, 'c', label='Dist. 5')

# Conflation of the five densities: the normalised pointwise product.
# (The original `multivariate_normal.pdf(dists)` misused the API — it treats
# the (5, 2000) array as sample points of a standard 1-D normal, producing
# the reported "shapes (5,2000) and (1,1) not aligned" error.)
graph = np.prod(dists, axis=0)
graph = graph / np.trapz(graph, domain)  # renormalise so it integrates to 1

plt.plot(domain, graph, 'm', label='Combined Dist.')
# legend() is called once, after all labelled artists exist (calling it
# earlier, as the original did, warns "No handles with labels found").
plt.legend()
plt.show()

标签: pythonnumpyscipystatisticsdistribution

解决方案


推荐阅读