繁体   English   中英

使用 Scipy(Python)将经验分布拟合到理论分布?

[英]Fitting empirical distribution to theoretical ones with Scipy (Python)?

简介:我有一个包含 30,000 多个 integer 值的列表,范围从 0 到 47,包括在内,例如[0,0,0,0,..,1,1,1,1,...,2,2,2,2,...,47,47,47,...]从一些连续分布中采样。 列表中的值不一定按顺序排列,但顺序对于这个问题并不重要。

问题:根据我的分布,我想计算任何给定值的 p 值(看到更大值的概率)。 例如,如您所见,0 的 p 值将接近 1,而更高数字的 p 值将趋于 0。

我不知道我是否正确,但为了确定概率,我认为我需要使我的数据符合最适合描述我的数据的理论分布。 我假设需要某种拟合优度测试来确定最佳 model。

有没有办法在 Python( ScipyNumpy )中实现这样的分析? 你能举出一些例子吗?

具有平方和误差 (SSE) 的分布拟合

这是对Saullo 答案的更新和修改,它使用当前scipy.stats分布的完整列表,并返回分布直方图和数据直方图之间SSE最小的分布。

示例配件

使用来自statsmodels的厄尔尼诺数据集,拟合分布并确定误差。 返回误差最小的分布。

所有发行版

所有拟合分布

最佳拟合分布

最佳拟合分布

示例代码

%matplotlib inline

import warnings
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.api as sm
from scipy.stats._continuous_distns import _distn_names
import matplotlib
import matplotlib.pyplot as plt

matplotlib.rcParams['figure.figsize'] = (16.0, 12.0)
matplotlib.style.use('ggplot')

# Create models from data
def best_fit_distribution(data, bins=200, ax=None):
    """Model data by finding best fit distribution to data"""
    # Get histogram of original data
    y, x = np.histogram(data, bins=bins, density=True)
    x = (x + np.roll(x, -1))[:-1] / 2.0

    # Best holders
    best_distributions = []

    # Estimate distribution parameters from data
    for ii, distribution in enumerate([d for d in _distn_names if not d in ['levy_stable', 'studentized_range']]):

        print("{:>3} / {:<3}: {}".format( ii+1, len(_distn_names), distribution ))

        distribution = getattr(st, distribution)

        # Try to fit the distribution
        try:
            # Ignore warnings from data that can't be fit
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore')
                
                # fit dist to data
                params = distribution.fit(data)

                # Separate parts of parameters
                arg = params[:-2]
                loc = params[-2]
                scale = params[-1]
                
                # Calculate fitted PDF and error with fit in distribution
                pdf = distribution.pdf(x, loc=loc, scale=scale, *arg)
                sse = np.sum(np.power(y - pdf, 2.0))
                
                # if axis pass in add to plot
                try:
                    if ax:
                        pd.Series(pdf, x).plot(ax=ax)
                    end
                except Exception:
                    pass

                # identify if this distribution is better
                best_distributions.append((distribution, params, sse))
        
        except Exception:
            pass

    
    return sorted(best_distributions, key=lambda x:x[2])

def make_pdf(dist, params, size=10000):
    """Generate distributions's Probability Distribution Function """

    # Separate parts of parameters
    arg = params[:-2]
    loc = params[-2]
    scale = params[-1]

    # Get sane start and end points of distribution
    start = dist.ppf(0.01, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.01, loc=loc, scale=scale)
    end = dist.ppf(0.99, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.99, loc=loc, scale=scale)

    # Build PDF and turn into pandas Series
    x = np.linspace(start, end, size)
    y = dist.pdf(x, loc=loc, scale=scale, *arg)
    pdf = pd.Series(y, x)

    return pdf

# Load data from statsmodels datasets
data = pd.Series(sm.datasets.elnino.load_pandas().data.set_index('YEAR').values.ravel())

# Plot for comparison
plt.figure(figsize=(12,8))
ax = data.plot(kind='hist', bins=50, density=True, alpha=0.5, color=list(matplotlib.rcParams['axes.prop_cycle'])[1]['color'])

# Save plot limits
dataYLim = ax.get_ylim()

# Find best fit distribution
best_distibutions = best_fit_distribution(data, 200, ax)
best_dist = best_distibutions[0]

# Update plots
ax.set_ylim(dataYLim)
ax.set_title(u'El Niño sea temp.\n All Fitted Distributions')
ax.set_xlabel(u'Temp (°C)')
ax.set_ylabel('Frequency')

# Make PDF with best params 
pdf = make_pdf(best_dist[0], best_dist[1])

# Display
plt.figure(figsize=(12,8))
ax = pdf.plot(lw=2, label='PDF', legend=True)
data.plot(kind='hist', bins=50, density=True, alpha=0.5, label='Data', legend=True, ax=ax)

param_names = (best_dist[0].shapes + ', loc, scale').split(', ') if best_dist[0].shapes else ['loc', 'scale']
param_str = ', '.join(['{}={:0.2f}'.format(k,v) for k,v in zip(param_names, best_dist[1])])
dist_str = '{}({})'.format(best_dist[0].name, param_str)

ax.set_title(u'El Niño sea temp. with best fit distribution \n' + dist_str)
ax.set_xlabel(u'Temp. (°C)')
ax.set_ylabel('Frequency')

SciPy v1.6.0 中实现了 90 多个分布函数 您可以使用他们的fit()方法测试其中一些如何适合您的数据。 检查下面的代码以获取更多详细信息:

在此处输入图像描述

import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.stats
size = 30000
x = np.arange(size)
y = scipy.int_(np.round_(scipy.stats.vonmises.rvs(5,size=size)*47))
h = plt.hist(y, bins=range(48))

dist_names = ['gamma', 'beta', 'rayleigh', 'norm', 'pareto']

for dist_name in dist_names:
    dist = getattr(scipy.stats, dist_name)
    params = dist.fit(y)
    arg = params[:-2]
    loc = params[-2]
    scale = params[-1]
    if arg:
        pdf_fitted = dist.pdf(x, *arg, loc=loc, scale=scale) * size
    else:
        pdf_fitted = dist.pdf(x, loc=loc, scale=scale) * size
    plt.plot(pdf_fitted, label=dist_name)
    plt.xlim(0,47)
plt.legend(loc='upper right')
plt.show()

参考:

- 拟合分布、拟合优度、p 值。 是否可以使用 Scipy (Python)来做到这一点?

- 带 Scipy 的配电配件

这里列出了 Scipy 0.12.0 (VI) 中可用的所有分布函数的名称:

dist_names = [ 'alpha', 'anglit', 'arcsine', 'beta', 'betaprime', 'bradford', 'burr', 'cauchy', 'chi', 'chi2', 'cosine', 'dgamma', 'dweibull', 'erlang', 'expon', 'exponweib', 'exponpow', 'f', 'fatiguelife', 'fisk', 'foldcauchy', 'foldnorm', 'frechet_r', 'frechet_l', 'genlogistic', 'genpareto', 'genexpon', 'genextreme', 'gausshyper', 'gamma', 'gengamma', 'genhalflogistic', 'gilbrat', 'gompertz', 'gumbel_r', 'gumbel_l', 'halfcauchy', 'halflogistic', 'halfnorm', 'hypsecant', 'invgamma', 'invgauss', 'invweibull', 'johnsonsb', 'johnsonsu', 'ksone', 'kstwobign', 'laplace', 'logistic', 'loggamma', 'loglaplace', 'lognorm', 'lomax', 'maxwell', 'mielke', 'nakagami', 'ncx2', 'ncf', 'nct', 'norm', 'pareto', 'pearson3', 'powerlaw', 'powerlognorm', 'powernorm', 'rdist', 'reciprocal', 'rayleigh', 'rice', 'recipinvgauss', 'semicircular', 't', 'triang', 'truncexpon', 'truncnorm', 'tukeylambda', 'uniform', 'vonmises', 'wald', 'weibull_min', 'weibull_max', 'wrapcauchy'] 

@Saullo Castro 提到的fit()方法提供最大似然估计 (MLE)。 您的数据的最佳分布是给您最高的分布,可以通过几种不同的方式确定:例如

1,给你最高对数可能性的那个。

2,给你最小的AIC,BIC或BICc值的那个(参见wiki: http://en.wikipedia.org/wiki/Akaike_information_criterion ,基本上可以看作是根据参数数量调整的对数似然,作为分布更多参数预计更适合)

3,最大化贝叶斯后验概率的一种。 (参见维基: http://en.wikipedia.org/wiki/Posterior_probability

当然,如果您已经有一个应该描述您的数据的分布(基于您特定领域的理论)并且想要坚持下去,您将跳过确定最佳拟合分布的步骤。

scipy不附带 function 来计算对数似然度(尽管提供了 MLE 方法),但硬代码一很容易:请参阅`scipy.stat.distributions` 的内置概率密度函数是否比用户提供的慢?

试试distfit库。

pip 安装分布

# Create 1000 random integers, value between [0-50]
X = np.random.randint(0, 50,1000)

# Retrieve P-value for y
y = [0,10,45,55,100]

# From the distfit library import the class distfit
from distfit import distfit

# Initialize.
# Set any properties here, such as alpha.
# The smoothing can be of use when working with integers. Otherwise your histogram
# may be jumping up-and-down, and getting the correct fit may be harder.
dist = distfit(alpha=0.05, smooth=10)

# Search for best theoretical fit on your empirical data
dist.fit_transform(X)

> [distfit] >fit..
> [distfit] >transform..
> [distfit] >[norm      ] [RSS: 0.0037894] [loc=23.535 scale=14.450] 
> [distfit] >[expon     ] [RSS: 0.0055534] [loc=0.000 scale=23.535] 
> [distfit] >[pareto    ] [RSS: 0.0056828] [loc=-384473077.778 scale=384473077.778] 
> [distfit] >[dweibull  ] [RSS: 0.0038202] [loc=24.535 scale=13.936] 
> [distfit] >[t         ] [RSS: 0.0037896] [loc=23.535 scale=14.450] 
> [distfit] >[genextreme] [RSS: 0.0036185] [loc=18.890 scale=14.506] 
> [distfit] >[gamma     ] [RSS: 0.0037600] [loc=-175.505 scale=1.044] 
> [distfit] >[lognorm   ] [RSS: 0.0642364] [loc=-0.000 scale=1.802] 
> [distfit] >[beta      ] [RSS: 0.0021885] [loc=-3.981 scale=52.981] 
> [distfit] >[uniform   ] [RSS: 0.0012349] [loc=0.000 scale=49.000] 

# Best fitted model
best_distr = dist.model
print(best_distr)

# Uniform shows best fit, with 95% CII (confidence intervals), and all other parameters
> {'distr': <scipy.stats._continuous_distns.uniform_gen at 0x16de3a53160>,
>  'params': (0.0, 49.0),
>  'name': 'uniform',
>  'RSS': 0.0012349021241149533,
>  'loc': 0.0,
>  'scale': 49.0,
>  'arg': (),
>  'CII_min_alpha': 2.45,
>  'CII_max_alpha': 46.55}

# Ranking distributions
dist.summary

# Plot the summary of fitted distributions
dist.plot_summary()

在此处输入图像描述

# Make prediction on new datapoints based on the fit
dist.predict(y)

# Retrieve your pvalues with 
dist.y_pred
# array(['down', 'none', 'none', 'up', 'up'], dtype='<U4')
dist.y_proba
array([0.02040816, 0.02040816, 0.02040816, 0.        , 0.        ])

# Or in one dataframe
dist.df

# The plot function will now also include the predictions of y
dist.plot()

最合适

请注意,在这种情况下,由于均匀分布,所有点都将很重要。 如果需要,您可以使用 dist.y_pred 进行过滤。

AFAICU,您的分布是离散的(而且只是离散的)。 因此,仅计算不同值的频率并将它们归一化就足以满足您的目的。 所以,一个例子来证明这一点:

In []: values= [0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 4]
In []: counts= asarray(bincount(values), dtype= float)
In []: cdf= counts.cumsum()/ counts.sum()

因此,看到高于1的值的概率很简单(根据互补累积分布 function (ccdf)

In []: 1- cdf[1]
Out[]: 0.40000000000000002

请注意, ccdf生存 function (sf)密切相关,但它也定义为离散分布,而sf仅定义为连续分布。

以下代码是一般答案的版本,但有更正和清晰。

import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.api as sm
import matplotlib as mpl
import matplotlib.pyplot as plt
import math
import random

mpl.style.use("ggplot")

def danoes_formula(data):
    """
    DANOE'S FORMULA
    https://en.wikipedia.org/wiki/Histogram#Doane's_formula
    """
    N = len(data)
    skewness = st.skew(data)
    sigma_g1 = math.sqrt((6*(N-2))/((N+1)*(N+3)))
    num_bins = 1 + math.log(N,2) + math.log(1+abs(skewness)/sigma_g1,2)
    num_bins = round(num_bins)
    return num_bins

def plot_histogram(data, results, n):
    ## n first distribution of the ranking
    N_DISTRIBUTIONS = {k: results[k] for k in list(results)[:n]}

    ## Histogram of data
    plt.figure(figsize=(10, 5))
    plt.hist(data, density=True, ec='white', color=(63/235, 149/235, 170/235))
    plt.title('HISTOGRAM')
    plt.xlabel('Values')
    plt.ylabel('Frequencies')

    ## Plot n distributions
    for distribution, result in N_DISTRIBUTIONS.items():
        # print(i, distribution)
        sse = result[0]
        arg = result[1]
        loc = result[2]
        scale = result[3]
        x_plot = np.linspace(min(data), max(data), 1000)
        y_plot = distribution.pdf(x_plot, loc=loc, scale=scale, *arg)
        plt.plot(x_plot, y_plot, label=str(distribution)[32:-34] + ": " + str(sse)[0:6], color=(random.uniform(0, 1), random.uniform(0, 1), random.uniform(0, 1)))
    
    plt.legend(title='DISTRIBUTIONS', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()

def fit_data(data):
    ## st.frechet_r,st.frechet_l: are disbled in current SciPy version
    ## st.levy_stable: a lot of time of estimation parameters
    ALL_DISTRIBUTIONS = [        
        st.alpha,st.anglit,st.arcsine,st.beta,st.betaprime,st.bradford,st.burr,st.cauchy,st.chi,st.chi2,st.cosine,
        st.dgamma,st.dweibull,st.erlang,st.expon,st.exponnorm,st.exponweib,st.exponpow,st.f,st.fatiguelife,st.fisk,
        st.foldcauchy,st.foldnorm, st.genlogistic,st.genpareto,st.gennorm,st.genexpon,
        st.genextreme,st.gausshyper,st.gamma,st.gengamma,st.genhalflogistic,st.gilbrat,st.gompertz,st.gumbel_r,
        st.gumbel_l,st.halfcauchy,st.halflogistic,st.halfnorm,st.halfgennorm,st.hypsecant,st.invgamma,st.invgauss,
        st.invweibull,st.johnsonsb,st.johnsonsu,st.ksone,st.kstwobign,st.laplace,st.levy,st.levy_l,
        st.logistic,st.loggamma,st.loglaplace,st.lognorm,st.lomax,st.maxwell,st.mielke,st.nakagami,st.ncx2,st.ncf,
        st.nct,st.norm,st.pareto,st.pearson3,st.powerlaw,st.powerlognorm,st.powernorm,st.rdist,st.reciprocal,
        st.rayleigh,st.rice,st.recipinvgauss,st.semicircular,st.t,st.triang,st.truncexpon,st.truncnorm,st.tukeylambda,
        st.uniform,st.vonmises,st.vonmises_line,st.wald,st.weibull_min,st.weibull_max,st.wrapcauchy
    ]
    
    MY_DISTRIBUTIONS = [st.beta, st.expon, st.norm, st.uniform, st.johnsonsb, st.gennorm, st.gausshyper]

    ## Calculae Histogram
    num_bins = danoes_formula(data)
    frequencies, bin_edges = np.histogram(data, num_bins, density=True)
    central_values = [(bin_edges[i] + bin_edges[i+1])/2 for i in range(len(bin_edges)-1)]

    results = {}
    for distribution in MY_DISTRIBUTIONS:
        ## Get parameters of distribution
        params = distribution.fit(data)
        
        ## Separate parts of parameters
        arg = params[:-2]
        loc = params[-2]
        scale = params[-1]
    
        ## Calculate fitted PDF and error with fit in distribution
        pdf_values = [distribution.pdf(c, loc=loc, scale=scale, *arg) for c in central_values]
        
        ## Calculate SSE (sum of squared estimate of errors)
        sse = np.sum(np.power(frequencies - pdf_values, 2.0))
        
        ## Build results and sort by sse
        results[distribution] = [sse, arg, loc, scale]
        
    results = {k: results[k] for k in sorted(results, key=results.get)}
    return results
        
def main():
    ## Import data
    data = pd.Series(sm.datasets.elnino.load_pandas().data.set_index('YEAR').values.ravel())
    results = fit_data(data)
    plot_histogram(data, results, 5)

if __name__ == "__main__":
    main()
    
    

在此处输入图像描述

虽然上述许多答案都是完全有效的,但似乎没有人完全回答您的问题,特别是以下部分:

我不知道我是否正确,但要确定概率,我认为我需要将我的数据拟合到最适合描述我的数据的理论分布。 我假设需要某种拟合优度来确定最佳 model。

参数化方法

这是您描述的使用一些理论分布并将参数拟合到数据的过程,并且有一些很好的答案如何做到这一点。

非参数方法

但是,也可以对您的问题使用非参数方法,这意味着您根本不假设任何基础分布。

通过使用所谓的经验分布 function 等于: Fn(x)= SUM( I[X<=x] ) / n 所以低于 x 的值的比例。

正如上述答案之一所指出的,您感兴趣的是逆 CDF(累积分布函数),它等于1-F(x)

可以证明,经验分布 function 将收敛到生成数据的任何“真实”CDF。

此外,通过以下方式构建 1-alpha 置信区间很简单:

L(X) = max{Fn(x)-en, 0}
U(X) = min{Fn(x)+en, 0}
en = sqrt( (1/2n)*log(2/alpha)

然后P( L(X) <= F(X) <= U(X) ) >= 1-alpha对于所有 x。

我很惊讶PierrOz 的答案有 0 票,而它是使用非参数方法估计 F(x) 的问题的完全有效答案。

请注意,对于任何 x>47,您提到的 P(X>=x)=0 问题只是个人偏好,可能会导致您选择参数方法而不是非参数方法。 然而,这两种方法都是解决您的问题的完全有效的方法。

有关上述陈述的更多详细信息和证明,我建议您查看“所有统计数据:Larry A. Wasserman 的统计推断简明课程”。 一本关于参数和非参数推理的优秀书籍。

编辑:由于您特别要求提供一些 python 示例,因此可以使用 numpy 来完成:

import numpy as np

def empirical_cdf(data, x):
    return np.sum(x<=data)/len(data)

def p_value(data, x):
    return 1-empirical_cdf(data, x)

# Generate some data for demonstration purposes
data = np.floor(np.random.uniform(low=0, high=48, size=30000))

print(empirical_cdf(data, 20))
print(p_value(data, 20)) # This is the value you're interested in

对我来说,这听起来像是概率密度估计问题。

from scipy.stats import gaussian_kde
occurences = [0,0,0,0,..,1,1,1,1,...,2,2,2,2,...,47]
values = range(0,48)
kde = gaussian_kde(map(float, occurences))
p = kde(values)
p = p/sum(p)
print "P(x>=1) = %f" % sum(p[1:])

另见http://jpktd.blogspot.com/2009/03/using-gaussian-kernel-density.html

使用OpenTURNS ,我将使用 BIC 标准 select 适合此类数据的最佳分布。 这是因为这个标准并没有给具有更多参数的分布带来太多优势。 事实上,如果一个分布具有更多参数,则拟合分布更容易更接近数据。 此外,Kolmogorov-Smirnov 在这种情况下可能没有意义,因为测量值中的一个小误差将对 p 值产生巨大影响。

为了说明这个过程,我加载了厄尔尼诺数据,其中包含从 1950 年到 2010 年的 732 次每月温度测量值:

import statsmodels.api as sm
dta = sm.datasets.elnino.load_pandas().data
dta['YEAR'] = dta.YEAR.astype(int).astype(str)
dta = dta.set_index('YEAR').T.unstack()
data = dta.values

使用GetContinuousUniVariateFactories static 方法很容易获得 30 个内置的单变量工厂分布。 完成后, BestModelBIC static 方法将返回最佳 model 和相应的 BIC 分数。

sample = ot.Sample([[p] for p in data]) # data reshaping
tested_factories = ot.DistributionFactory.GetContinuousUniVariateFactories()
best_model, best_bic = ot.FittingTest.BestModelBIC(sample,
                                                   tested_factories)
print("Best=",best_model)

打印:

Best= Beta(alpha = 1.64258, beta = 2.4348, a = 18.936, b = 29.254)

为了以图形方式比较拟合直方图,我使用了最佳分布的drawPDF方法。

import openturns.viewer as otv
graph = ot.HistogramFactory().build(sample).drawPDF()
bestPDF = best_model.drawPDF()
bestPDF.setColors(["blue"])
graph.add(bestPDF)
graph.setTitle("Best BIC fit")
name = best_model.getImplementation().getClassName()
graph.setLegends(["Histogram",name])
graph.setXTitle("Temperature (°C)")
otv.View(graph)

这会产生:

Beta 适合厄尔尼诺温度

BestModelBIC文档中提供了有关此主题的更多详细信息。 可以在 SciPyDistribution 中包含 Scipy 分发版,甚至可以在ChaosPy分发版中包含ChaosPyDistribution ,但我猜当前脚本可以满足大多数实际目的。

如果我不理解您的需求,请原谅我,但是将您的数据存储在字典中,其中键是 0 到 47 之间的数字,并且值是原始列表中相关键的出现次数?
因此,您的可能性 p(x) 将是大于 x 的所有键值的总和除以 30000。

我发现最简单的方法是使用 fitter 模块,您可以简单地pip install fitter 您所要做的就是通过 pandas 导入数据集。 它内置了 function 从 scipy 库中搜索所有 80 个分布到数据,并通过各种方法获得最佳拟合。 例如,

f = Fitter(height, distributions=['gamma','lognorm', "beta","burr","norm"])
f.fit()
f.summary()

在这里,作者提供了一个分布列表,因为扫描所有 80 个可能很耗时。

f.get_best(method = 'sumsquare_error')

这将为您提供 5 个最佳分布及其参数。

            sumsquare_error    aic          bic       kl_div
chi2             0.000010  1716.234916 -1945.821606     inf
gamma            0.000010  1716.234909 -1945.821606     inf
rayleigh         0.000010  1711.807360 -1945.526026     inf
norm             0.000011  1758.797036 -1934.865211     inf
cauchy           0.000011  1762.735606 -1934.803414     inf

这就是 output 的样子。

您还拥有distributions=get_common_distributions()属性,其中包含大约 10 个最常用的分布,并为您拟合和检查它们。

它还具有许多其他功能,例如绘制直方图,所有完整的文档都可以在这里找到。对于科学家、工程师和一般来说,它是一个被严重低估的模块。

我从第一个答案重新设计了分布 function,其中我包含了一个选择参数,用于选择一个拟合优度测试,这将缩小适合数据的分布 function:

import numpy as np
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt
import pylab

def make_hist(data):
    #### General code:
    bins_formulas = ['auto', 'fd', 'scott', 'rice', 'sturges', 'doane', 'sqrt']
    bins = np.histogram_bin_edges(a=data, bins='fd', range=(min(data), max(data)))
    # print('Bin value = ', bins)

    # Obtaining the histogram of data:
    Hist, bin_edges = histogram(a=data, bins=bins, range=(min(data), max(data)), density=True)
    bin_mid = (bin_edges + np.roll(bin_edges, -1))[:-1] / 2.0  # go from bin edges to bin middles
    return Hist, bin_mid

def make_pdf(dist, params, size):
    """Generate distributions's Probability Distribution Function """

    # Separate parts of parameters
    arg = params[:-2]
    loc = params[-2]
    scale = params[-1]

    # Get sane start and end points of distribution
    start = dist.ppf(0.01, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.01, loc=loc, scale=scale)
    end = dist.ppf(0.99, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.99, loc=loc, scale=scale)

    # Build PDF and turn into pandas Series
    x = np.linspace(start, end, size)
    y = dist.pdf(x, loc=loc, scale=scale, *arg)
    pdf = pd.Series(y, x)
    return pdf, x, y

def compute_r2_test(y_true, y_predicted):
    sse = sum((y_true - y_predicted)**2)
    tse = (len(y_true) - 1) * np.var(y_true, ddof=1)
    r2_score = 1 - (sse / tse)
    return r2_score, sse, tse

def get_best_distribution_2(data, method, plot=False):
    dist_names = ['alpha', 'anglit', 'arcsine', 'beta', 'betaprime', 'bradford', 'burr', 'cauchy', 'chi', 'chi2', 'cosine', 'dgamma', 'dweibull', 'erlang', 'expon', 'exponweib', 'exponpow', 'f', 'fatiguelife', 'fisk', 'foldcauchy', 'foldnorm', 'frechet_r', 'frechet_l', 'genlogistic', 'genpareto', 'genexpon', 'genextreme', 'gausshyper', 'gamma', 'gengamma', 'genhalflogistic', 'gilbrat',  'gompertz', 'gumbel_r', 'gumbel_l', 'halfcauchy', 'halflogistic', 'halfnorm', 'hypsecant', 'invgamma', 'invgauss', 'invweibull', 'johnsonsb', 'johnsonsu', 'ksone', 'kstwobign', 'laplace', 'logistic', 'loggamma', 'loglaplace', 'lognorm', 'lomax', 'maxwell', 'mielke', 'moyal', 'nakagami', 'ncx2', 'ncf', 'nct', 'norm', 'pareto', 'pearson3', 'powerlaw', 'powerlognorm', 'powernorm', 'rdist', 'reciprocal', 'rayleigh', 'rice', 'recipinvgauss', 'semicircular', 't', 'triang', 'truncexpon', 'truncnorm', 'tukeylambda', 'uniform', 'vonmises', 'wald', 'weibull_min', 'weibull_max', 'wrapcauchy']
    
    # Applying the Goodness-to-fit tests to select the best distribution that fits the data:
    dist_results = []
    dist_IC_results = []
    params = {}
    params_IC = {}
    params_SSE = {}

    if method == 'sse':
########################################################################################################################
######################################## Sum of Square Error (SSE) test ################################################
########################################################################################################################
        # Best holders
        best_distribution = st.norm
        best_params = (0.0, 1.0)
        best_sse = np.inf

        for dist_name in dist_names:
            dist = getattr(st, dist_name)
            param = dist.fit(data)
            params[dist_name] = param
            N_len = len(list(data))
            # Obtaining the histogram:
            Hist_data, bin_data = make_hist(data=data)

            # fit dist to data
            params_dist = dist.fit(data)

            # Separate parts of parameters
            arg = params_dist[:-2]
            loc = params_dist[-2]
            scale = params_dist[-1]

            # Calculate fitted PDF and error with fit in distribution
            pdf = dist.pdf(bin_data, loc=loc, scale=scale, *arg)
            sse = np.sum(np.power(Hist_data - pdf, 2.0))

            # identify if this distribution is better
            if best_sse > sse > 0:
                best_distribution = dist
                best_params = params_dist
                best_stat_test_val = sse

        print('\n################################ Sum of Square Error test parameters #####################################')
        best_dist = best_distribution
        print("Best fitting distribution (SSE test) :" + str(best_dist))
        print("Best SSE value (SSE test) :" + str(best_stat_test_val))
        print("Parameters for the best fit (SSE test) :" + str(params[best_dist]))
        print('###########################################################################################################\n')

########################################################################################################################
########################################################################################################################
########################################################################################################################

    if method == 'r2':
########################################################################################################################
##################################################### R Square (R^2) test ##############################################
########################################################################################################################
    # Best holders
    best_distribution = st.norm
    best_params = (0.0, 1.0)
    best_r2 = np.inf

    for dist_name in dist_names:
        dist = getattr(st, dist_name)
        param = dist.fit(data)
        params[dist_name] = param
        N_len = len(list(data))
        # Obtaining the histogram:
        Hist_data, bin_data = make_hist(data=data)

        # fit dist to data
        params_dist = dist.fit(data)

        # Separate parts of parameters
        arg = params_dist[:-2]
        loc = params_dist[-2]
        scale = params_dist[-1]

        # Calculate fitted PDF and error with fit in distribution
        pdf = dist.pdf(bin_data, loc=loc, scale=scale, *arg)
        r2 = compute_r2_test(y_true=Hist_data, y_predicted=pdf)

        # identify if this distribution is better
        if best_r2 > r2 > 0:
            best_distribution = dist
            best_params = params_dist
            best_stat_test_val = r2

    print('\n############################## R Square test parameters ###########################################')
    best_dist = best_distribution
    print("Best fitting distribution (R^2 test) :" + str(best_dist))
    print("Best R^2 value (R^2 test) :" + str(best_stat_test_val))
    print("Parameters for the best fit (R^2 test) :" + str(params[best_dist]))
    print('#####################################################################################################\n')

########################################################################################################################
########################################################################################################################
########################################################################################################################

    if method == 'ic':
########################################################################################################################
######################################## Information Criteria (IC) test ################################################
########################################################################################################################
        for dist_name in dist_names:
            dist = getattr(st, dist_name)
            param = dist.fit(data)
            params[dist_name] = param
            N_len = len(list(data))

            # Obtaining the histogram:
            Hist_data, bin_data = make_hist(data=data)

            # fit dist to data
            params_dist = dist.fit(data)

            # Separate parts of parameters
            arg = params_dist[:-2]
            loc = params_dist[-2]
            scale = params_dist[-1]

            # Calculate fitted PDF and error with fit in distribution
            pdf = dist.pdf(bin_data, loc=loc, scale=scale, *arg)
            sse = np.sum(np.power(Hist_data - pdf, 2.0))

            # Obtaining the log of the pdf:
            loglik = np.sum(dist.logpdf(bin_data, *params_dist))
            k = len(params_dist[:])
            n = len(data)
            aic = 2 * k - 2 * loglik
            bic = n * np.log(sse / n) + k * np.log(n)
            dist_IC_results.append((dist_name, aic))
            # dist_IC_results.append((dist_name, bic))

        # select the best fitted distribution and store the name of the best fit and its IC value
        best_dist, best_ic = (min(dist_IC_results, key=lambda item: item[1]))

        print('\n############################ Information Criteria (IC) test parameters ##################################')
        print("Best fitting distribution (IC test) :" + str(best_dist))
        print("Best IC value (IC test) :" + str(best_ic))
        print("Parameters for the best fit (IC test) :" + str(params[best_dist]))
        print('###########################################################################################################\n')

########################################################################################################################
########################################################################################################################
########################################################################################################################

    if method == 'chi':
########################################################################################################################
################################################ Chi-Square (Chi^2) test ###############################################
########################################################################################################################
        # Set up 50 bins for chi-square test
        # Observed data will be approximately evenly distrubuted aross all bins
        percentile_bins = np.linspace(0,100,51)
        percentile_cutoffs = np.percentile(data, percentile_bins)
        observed_frequency, bins = (np.histogram(data, bins=percentile_cutoffs))
        cum_observed_frequency = np.cumsum(observed_frequency)

        chi_square = []
        for dist_name in dist_names:
            dist = getattr(st, dist_name)
            param = dist.fit(data)
            params[dist_name] = param

            # Obtaining the histogram:
            Hist_data, bin_data = make_hist(data=data)

            # fit dist to data
            params_dist = dist.fit(data)

            # Separate parts of parameters
            arg = params_dist[:-2]
            loc = params_dist[-2]
            scale = params_dist[-1]

            # Calculate fitted PDF and error with fit in distribution
            pdf = dist.pdf(bin_data, loc=loc, scale=scale, *arg)

            # Get expected counts in percentile bins
            # This is based on a 'cumulative distrubution function' (cdf)
            cdf_fitted = dist.cdf(percentile_cutoffs, *arg, loc=loc, scale=scale)
            expected_frequency = []
            for bin in range(len(percentile_bins) - 1):
                expected_cdf_area = cdf_fitted[bin + 1] - cdf_fitted[bin]
                expected_frequency.append(expected_cdf_area)

            # calculate chi-squared
            expected_frequency = np.array(expected_frequency) * size
            cum_expected_frequency = np.cumsum(expected_frequency)
            ss = sum(((cum_expected_frequency - cum_observed_frequency) ** 2) / cum_observed_frequency)
            chi_square.append(ss)

            # Applying the Chi-Square test:
            # D, p = scipy.stats.chisquare(f_obs=pdf, f_exp=Hist_data)
            # print("Chi-Square test Statistics value for " + dist_name + " = " + str(D))
            print("p value for " + dist_name + " = " + str(chi_square))
            dist_results.append((dist_name, chi_square))

        # select the best fitted distribution and store the name of the best fit and its p value
        best_dist, best_stat_test_val = (min(dist_results, key=lambda item: item[1]))

        print('\n#################################### Chi-Square test parameters #######################################')
        print("Best fitting distribution (Chi^2 test) :" + str(best_dist))
        print("Best p value (Chi^2 test) :" + str(best_stat_test_val))
        print("Parameters for the best fit (Chi^2 test) :" + str(params[best_dist]))
        print('#########################################################################################################\n')

########################################################################################################################
########################################################################################################################
########################################################################################################################

    if method == 'ks':
########################################################################################################################
########################################## Kolmogorov-Smirnov (KS) test ################################################
########################################################################################################################
        for dist_name in dist_names:
            dist = getattr(st, dist_name)
            param = dist.fit(data)
            params[dist_name] = param

            # Applying the Kolmogorov-Smirnov test:
            D, p = st.kstest(data, dist_name, args=param)
            # D, p = st.kstest(data, dist_name, args=param, N=N_len, alternative='greater')
            # print("Kolmogorov-Smirnov test Statistics value for " + dist_name + " = " + str(D))
            print("p value for " + dist_name + " = " + str(p))
            dist_results.append((dist_name, p))

        # select the best fitted distribution and store the name of the best fit and its p value
        best_dist, best_stat_test_val = (max(dist_results, key=lambda item: item[1]))

        print('\n################################ Kolmogorov-Smirnov test parameters #####################################')
        print("Best fitting distribution (KS test) :" + str(best_dist))
        print("Best p value (KS test) :" + str(best_stat_test_val))
        print("Parameters for the best fit (KS test) :" + str(params[best_dist]))
        print('###########################################################################################################\n')

########################################################################################################################
########################################################################################################################
########################################################################################################################

    # Collate results and sort by goodness of fit (best at top)
    results = pd.DataFrame()
    results['Distribution'] = dist_names
    results['chi_square'] = chi_square
    # results['p_value'] = p_values
    results.sort_values(['chi_square'], inplace=True)

    # Plotting the distribution with histogram:
    if plot:
        bins_val = np.histogram_bin_edges(a=data, bins='fd', range=(min(data), max(data)))
        plt.hist(x=data, bins=bins_val, range=(min(data), max(data)), density=True)
        # pylab.hist(x=data, bins=bins_val, range=(min(data), max(data)))
        best_param = params[best_dist]
        best_dist_p = getattr(st, best_dist)
        pdf, x_axis_pdf, y_axis_pdf = make_pdf(dist=best_dist_p, params=best_param, size=len(data))
        plt.plot(x_axis_pdf, y_axis_pdf, color='red', label='Best dist ={0}'.format(best_dist))
        plt.legend()
        plt.title('Histogram and Distribution plot of data')
        # plt.show()
        plt.show(block=False)
        plt.pause(5)  # Pauses the program for 5 seconds
        plt.close('all')

    return best_dist, best_stat_test_val, params[best_dist]

然后继续 make_pdf function 以根据您的拟合优度测试获取选定的分布。

基于Timothy Davenports 的回答,我将代码重构为可用作库,并将其作为 github 和 pypi 项目使用,请参阅:

一个目标是使密度选项可用,并将 output 结果作为文件。 查看实现的主要部分:

    bfd=BestFitDistribution(data)
    for density in [True,False]:
        suffix="density" if density else ""
        bfd.analyze(title=u'El Niño sea temp.',x_label=u'Temp (°C)',y_label='Frequency',outputFilePrefix=f"/tmp/ElNinoPDF{suffix}",density=density)
    # uncomment for interactive display
    # plt.show()

全部 最合适

该库还有一个单元测试,例如正态分布测试

def testNormal(self):
        '''
        test the normal distribution
        '''
        # use euler constant as seed
        np.random.seed(0)
        # statistically relevant number of datapoints
        datapoints=1000
        a = np.random.normal(40, 10, datapoints)
        df= pd.DataFrame({'nums':a})
        outputFilePrefix="/tmp/normalDist"
        bfd=BestFitDistribution(df,debug=True)
        bfd.analyze(title="normal distribution",x_label="x",y_label="random",outputFilePrefix=outputFilePrefix)

一切正常 正常最适合 如果您发现问题或需要改进的空间, 请将问题添加到项目中。 讨论也已启用。

下面的代码可能不是最新的,请使用 pypi 或 github 存储库获取最新版本。

'''
Created on 2022-05-17

see 
  https://stackoverflow.com/questions/6620471/fitting-empirical-distribution-to-theoretical-ones-with-scipy-python/37616966#37616966

@author: https://stackoverflow.com/users/832621/saullo-g-p-castro
see https://stackoverflow.com/a/37616966/1497139

@author: https://stackoverflow.com/users/2087463/tmthydvnprt
see  https://stackoverflow.com/a/37616966/1497139

@author: https://stackoverflow.com/users/1497139/wolfgang-fahl
see 

'''
import traceback
import sys
import warnings

import numpy as np
import pandas as pd
import scipy.stats as st

from scipy.stats._continuous_distns import _distn_names

import statsmodels.api as sm

import matplotlib
import matplotlib.pyplot as plt

class BestFitDistribution():
    '''
    Find the best Probability Distribution Function for the given data
    '''
    
    def __init__(self,data,distributionNames:list=None,debug:bool=False):
        '''
        constructor
        
        Args:
            data(dataFrame): the data to analyze
            distributionNames(list): list of distributionNames to try
            debug(bool): if True show debugging information
        '''
        self.debug=debug
        self.matplotLibParams()
        if distributionNames is None:
            self.distributionNames=[d for d in _distn_names if not d in ['levy_stable', 'studentized_range']]
        else:
            self.distributionNames=distributionNames
        self.data=data
        
    def matplotLibParams(self):
        '''
        set matplotlib parameters
        '''
        matplotlib.rcParams['figure.figsize'] = (16.0, 12.0)
        #matplotlib.style.use('ggplot')
        matplotlib.use("WebAgg")

    # Create models from data
    def best_fit_distribution(self,bins:int=200, ax=None,density:bool=True):
        """
        Model data by finding best fit distribution to data
        """
        # Get histogram of original data
        y, x = np.histogram(self.data, bins=bins, density=density)
        x = (x + np.roll(x, -1))[:-1] / 2.0
    
        # Best holders
        best_distributions = []
        distributionCount=len(self.distributionNames)
        # Estimate distribution parameters from data
        for ii, distributionName in enumerate(self.distributionNames):
    
            print(f"{ii+1:>3} / {distributionCount:<3}: {distributionName}")
    
            distribution = getattr(st, distributionName)
    
            # Try to fit the distribution
            try:
                # Ignore warnings from data that can't be fit
                with warnings.catch_warnings():
                    warnings.filterwarnings('ignore')
                    
                    # fit dist to data
                    params = distribution.fit(self.data)
    
                    # Separate parts of parameters
                    arg = params[:-2]
                    loc = params[-2]
                    scale = params[-1]
                    
                    # Calculate fitted PDF and error with fit in distribution
                    pdf = distribution.pdf(x, loc=loc, scale=scale, *arg)
                    sse = np.sum(np.power(y - pdf, 2.0))
                    
                    # if axis pass in add to plot
                    try:
                        if ax:
                            pd.Series(pdf, x).plot(ax=ax)
                    except Exception:
                        pass
    
                    # identify if this distribution is better
                    best_distributions.append((distribution, params, sse))
            
            except Exception as ex:
                if self.debug:
                    trace=traceback.format_exc()
                    msg=f"fit for {distributionName} failed:{ex}\n{trace}"
                    print(msg,file=sys.stderr)
                pass
        
        return sorted(best_distributions, key=lambda x:x[2])

    def make_pdf(self,dist, params:list, size=10000):
        """
        Generate distributions's Probability Distribution Function 
        
        Args:
            dist: Distribution
            params(list): parameter
            size(int): size
            
        Returns:
            dataframe: Power Distribution Function 
        
        """
    
        # Separate parts of parameters
        arg = params[:-2]
        loc = params[-2]
        scale = params[-1]
    
        # Get sane start and end points of distribution
        start = dist.ppf(0.01, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.01, loc=loc, scale=scale)
        end   = dist.ppf(0.99, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.99, loc=loc, scale=scale)
    
        # Build PDF and turn into pandas Series
        x = np.linspace(start, end, size)
        y = dist.pdf(x, loc=loc, scale=scale, *arg)
        pdf = pd.Series(y, x)
    
        return pdf
    
    def analyze(self,title,x_label,y_label,outputFilePrefix=None,imageFormat:str='png',allBins:int=50,distBins:int=200,density:bool=True):
        """
        
        analyze the Probabilty Distribution Function
        
        Args:
            data: Panda Dataframe or numpy array
            title(str): the title to use
            x_label(str): the label for the x-axis
            y_label(str): the label for the y-axis
            outputFilePrefix(str): the prefix of the outputFile
            imageFormat(str): imageFormat e.g. png,svg
            allBins(int): the number of bins for all
            distBins(int): the number of bins for the distribution
            density(bool): if True show relative density
        """
        self.allBins=allBins
        self.distBins=distBins
        self.density=density
        self.title=title
        self.x_label=x_label
        self.y_label=y_label
        self.imageFormat=imageFormat
        self.outputFilePrefix=outputFilePrefix
        self.color=list(matplotlib.rcParams['axes.prop_cycle'])[1]['color']
        self.best_dist=None
        self.analyzeAll()
        if outputFilePrefix is not None:
            self.saveFig(f"{outputFilePrefix}All.{imageFormat}", imageFormat)
            plt.close(self.figAll)
        if self.best_dist:
            self.analyzeBest()
            if outputFilePrefix is not None:
                self.saveFig(f"{outputFilePrefix}Best.{imageFormat}", imageFormat)
                plt.close(self.figBest)
            
    def analyzeAll(self):
        '''
        analyze the given data
        
        '''
        # Plot for comparison
        figTitle=f"{self.title}\n All Fitted Distributions"
        self.figAll=plt.figure(figTitle,figsize=(12,8))
        ax = self.data.plot(kind='hist', bins=self.allBins, density=self.density, alpha=0.5, color=self.color)
        
        # Save plot limits
        dataYLim = ax.get_ylim()
        # Update plots
        ax.set_ylim(dataYLim)
        ax.set_title(figTitle)
        ax.set_xlabel(self.x_label)
        ax.set_ylabel(self.y_label)
        
        # Find best fit distribution
        best_distributions = self.best_fit_distribution(bins=self.distBins, ax=ax,density=self.density)
        if len(best_distributions)>0:
            self.best_dist = best_distributions[0]
            # Make PDF with best params 
            self.pdf = self.make_pdf(self.best_dist[0], self.best_dist[1])
        
    def analyzeBest(self):
        '''
        analyze the Best Property Distribution function
        '''
        # Display
        figLabel="PDF"
        self.figBest=plt.figure(figLabel,figsize=(12,8))
        ax = self.pdf.plot(lw=2, label=figLabel, legend=True)
        self.data.plot(kind='hist', bins=self.allBins, density=self.density, alpha=0.5, label='Data', legend=True, ax=ax,color=self.color)
        
        param_names = (self.best_dist[0].shapes + ', loc, scale').split(', ') if self.best_dist[0].shapes else ['loc', 'scale']
        param_str = ', '.join(['{}={:0.2f}'.format(k,v) for k,v in zip(param_names, self.best_dist[1])])
        dist_str = '{}({})'.format(self.best_dist[0].name, param_str)
        
        ax.set_title(f'{self.title} with best fit distribution \n' + dist_str)
        ax.set_xlabel(self.x_label)
        ax.set_ylabel(self.y_label)

    def saveFig(self,outputFile:str=None,imageFormat='png'):
        '''
        save the current Figure to the given outputFile
        
        Args:
            outputFile(str): the outputFile to save to
            imageFormat(str): the imageFormat to use e.g. png/svg
        '''
        plt.savefig(outputFile, format=imageFormat) # dpi 
     

if __name__ == '__main__':
    # Load data from statsmodels datasets
    data = pd.Series(sm.datasets.elnino.load_pandas().data.set_index('YEAR').values.ravel())
    
    bfd=BestFitDistribution(data)
    for density in [True,False]:
        suffix="density" if density else ""
        bfd.analyze(title=u'El Niño sea temp.',x_label=u'Temp (°C)',y_label='Frequency',outputFilePrefix=f"/tmp/ElNinoPDF{suffix}",density=density)
    # uncomment for interactive display
    # plt.show()


暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM