
[英]How to calculate the probability of a datapoint fitting in a known distribution in R
[英]Calculate y value for distribution fitting functions R
我正在为不同的分布函数绘制曲线,我需要知道每条曲线的最高 y 值。 稍后我将 plot 仅选择一条曲线作为最佳拟合。
这是 function (它有点硬编码,我正在处理它):
library(plyr)
library(dplyr)
library(fitdistrplus)
library(evd)
library(gamlss)
fdistr <- function(d) {
# Uncomment to try run line by line
# d <- data_to_plot
TLT <- d$TLT
if (sum(TLT<=0)) {TLT[TLT<=0] <- 0.001} # removing value < 0 for log clculation
gev <- fgev(TLT, std.err=FALSE)
distr <- c('norm', 'lnorm', 'weibull', 'gamma')
fit <- lapply(X=distr, FUN=fitdist, data=TLT)
fit[[5]] <- gev
distr[5] <- 'gev'
names(fit) <- distr
Loglike <- sapply(X=fit, FUN=logLik)
Loglike_Best <- which(Loglike == max(Loglike))
# Uncomment to try run line by line
# max <- which.max(density(d$TLT)$y)
# max_density <- stats::density(d$TLT)$y[max]
# max_y <- max_density
x_data <- max(d$TLT)
hist(TLT, prob=TRUE, breaks= x_data,
main=paste(d$DLT_Code[1],
'- best :',
names(Loglike[Loglike_Best])),
sub = 'Total Lead Times',
col='lightgrey',
border='white'
# ylim= c(0,max_y)
)
lines(density(TLT),
col='darkgrey',
lty=2,
lwd=2)
grid(nx = NA, ny = NULL, col = "gray", lty = "dotted",
lwd = .5, equilogs = TRUE)
curve(dnorm(x,
mean=fit[['norm']]$estimate[1],
sd=fit[['norm']]$estimate[2]),
add=TRUE, col='blue', lwd=2)
curve(dlnorm(x,
meanlog=fit[['lnorm']]$estimate[1],
sdlog=fit[['lnorm']]$estimate[2]),
add=TRUE, col='darkgreen', lwd=2)
curve(dweibull(x,
shape=fit[['weibull']]$estimate[1],
scale=fit[['weibull']]$estimate[2]),
add=TRUE, col='purple', lwd=2)
curve(dgamma(x,
shape=fit[['gamma']]$estimate[1],
rate=fit[['gamma']]$estimate[2]),
add=TRUE, col='Gold', lwd=2)
curve(dgev(x,
loc=fit[['gev']]$estimate[1],
scale=fit[['gev']]$estimate[2],
shape=fit[['gev']]$estimate[3]),
add=TRUE, col='red', lwd=2)
legend_loglik <- paste(c('Norm', 'LogNorm', 'Weibull', 'Gamma','GEV'), c(':'),
round(Loglike, digits=2))
legend("topright", legend=legend_loglik,
col=c('blue', 'darkgreen', 'purple', 'gold', 'red'),
lty=1, lwd=2,
bty='o', bg='white', box.lty=2, box.lwd = 1, box.col='white')
return(data.frame(DLT_Code = d$DLT_Code[1],
n = length(d$TLT),
Best = names(Loglike[Loglike_Best]),
lnorm = Loglike[1],
norm = Loglike[2],
weibul = Loglike[3],
gamma = Loglike[4],
GEV = Loglike[5]))
}
# Creating data set
TLT <- c(rep(0,32), rep(1,120), rep(2,10), rep(3,67), rep(4,14), rep(5,7), 6)
DLT_Code <- c(rep('DLT_Code',251))
data_to_plot <- data.frame(cbind(DLT_Code,TLT))
data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))
DLT_Distr <- do.call(rbind, by(data = data_to_plot, INDICES = data_to_plot$DLT_Code, FUN=fdistr))
我试图玩max_y
然后在ylim
中使用它。 我只能为正常密度做它,但不能用于 rest 曲线。
目前 plot 看起来像这样(一些曲线被剪掉了):
如果设置ylim = c(0,2)
我们可以看到,对数正态分布和伽马分布超过 1:
我需要知道每条曲线的最大值,因此,当我选择要打印的曲线时,设置正确的ylim
。
您可以使用purrr::map_dbl
到 map function optimize
您的密度,如果您稍微重新排列您的代码并且您知道您想找到哪些输入值它们的最大值/存在密度。
您可以提前使用任何参数设置密度,这样您就可以使用optimize
找到它们的峰值,并将它们传递给curve
function。
作为一个可重复的小例子:
library(purrr)
# parameterize your densities
mynorm <- function(x) dnorm(x, mean = 0, sd = 1)
mygamma <- function(x) dgamma(x, rate = .5, shape = 1)
# get largest maximum over interval
ymax <- max(purrr::map_dbl(c(mynorm, mygamma), ~ optimize(., interval = c(0, 3), maximum = T)$objective))
# 0.4999811
# plot data
curve(mynorm, col = "blue", lwd = 2, xlim = c(0, 3), ylim = c(0, ymax * 1.1))
curve(mygamma, col = "red", lwd = 2, add = T)
使用您的代码,我已经实现了上述解决方案并调整了curve
function 的x
网格,以在我们在评论中讨论后向您展示我的意思,以使事情更清楚并向您展示您应该实际绘制的内容:
library(plyr)
library(dplyr)
library(fitdistrplus)
library(evd)
library(gamlss)
library(purrr) # <- add this library
fdistr <- function(d) {
# Uncomment to try run line by line
# d <- data_to_plot
TLT <- d$TLT
if (sum(TLT<=0)) {TLT[TLT<=0] <- 0.001} # removing value < 0 for log clculation
gev <- fgev(TLT, std.err=FALSE)
distr <- c('norm', 'lnorm', 'weibull', 'gamma')
fit <- lapply(X=distr, FUN=fitdist, data=TLT)
fit[[5]] <- gev
distr[5] <- 'gev'
names(fit) <- distr
Loglike <- sapply(X=fit, FUN=logLik)
Loglike_Best <- which(Loglike == max(Loglike))
# Uncomment to try run line by line
# max <- which.max(density(d$TLT)$y)
# max_density <- stats::density(d$TLT)$y[max]
# max_y <- max_density
x_data <- max(d$TLT)
# parameterize your densities before plotting
mynorm <- function(x) {
dnorm(x,
mean=fit[['norm']]$estimate[1],
sd=fit[['norm']]$estimate[2])
}
mylnorm <- function(x){
dlnorm(x,
meanlog=fit[['lnorm']]$estimate[1],
sdlog=fit[['lnorm']]$estimate[2])
}
myweibull <- function(x) {
dweibull(x,
shape=fit[['weibull']]$estimate[1],
scale=fit[['weibull']]$estimate[2])
}
mygamma <- function(x) {
dgamma(x,
shape=fit[['gamma']]$estimate[1],
rate=fit[['gamma']]$estimate[2])
}
mygev <- function(x){
dgev(x,
loc=fit[['gev']]$estimate[1],
scale=fit[['gev']]$estimate[2],
shape=fit[['gev']]$estimate[3])
}
distributions <- c(mynorm, mylnorm, myweibull, mygamma, mygev)
# get the max of each density
y <- purrr::map_dbl(distributions, ~ optimize(., interval = c(0, x_data), maximum = T)$objective)
# find the max (excluding infinity)
ymax <- max(y[abs(y) < Inf])
hist(TLT, prob=TRUE, breaks= x_data,
main=paste(d$DLT_Code[1],
'- best :',
names(Loglike[Loglike_Best])),
sub = 'Total Lead Times',
col='lightgrey',
border='white',
ylim= c(0, ymax)
)
lines(density(TLT),
col='darkgrey',
lty=2,
lwd=2)
grid(nx = NA, ny = NULL, col = "gray", lty = "dotted",
lwd = .5, equilogs = TRUE)
curve(mynorm,
add=TRUE, col='blue', lwd=2, n = 1E5) # <- increase x grid
curve(mylnorm,
add=TRUE, col='darkgreen', lwd=2, n = 1E5) # <- increase x grid
curve(myweibull,
add=TRUE, col='purple', lwd=2, n = 1E5) # <- increase x grid
curve(mygamma,
add=TRUE, col='Gold', lwd=2, n = 1E5) # <- increase x grid
curve(mygev,
add=TRUE, col='red', lwd=2, n = 1E5) # <- increase x grid
legend_loglik <- paste(c('Norm', 'LogNorm', 'Weibull', 'Gamma','GEV'), c(':'),
round(Loglike, digits=2))
legend("topright", legend=legend_loglik,
col=c('blue', 'darkgreen', 'purple', 'gold', 'red'),
lty=1, lwd=2,
bty='o', bg='white', box.lty=2, box.lwd = 1, box.col='white')
return(data.frame(DLT_Code = d$DLT_Code[1],
n = length(d$TLT),
Best = names(Loglike[Loglike_Best]),
lnorm = Loglike[1],
norm = Loglike[2],
weibul = Loglike[3],
gamma = Loglike[4],
GEV = Loglike[5]))
}
# Creating data set
TLT <- c(rep(0,32), rep(1,120), rep(2,10), rep(3,67), rep(4,14), rep(5,7), 6)
DLT_Code <- c(rep('DLT_Code',251))
data_to_plot <- data.frame(cbind(DLT_Code,TLT))
data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))
DLT_Distr <- do.call(rbind, by(data = data_to_plot, INDICES = data_to_plot$DLT_Code, FUN=fdistr))
为什么您的 plot 高度与解决方案 output 不匹配
为了进一步说明 plot 的情况以及您可能遇到的一些困惑,您需要了解curve
function 是如何绘制数据的。 默认情况下, curve
采用 101 个 x 值并评估您的函数以获得它们的 y 值,然后将这些点绘制为一条线。 因为您的某些密度的峰值非常尖锐, curve
function 没有评估足够的 x 值来 plot 您的密度峰值。 为了表明你想要我的意思是我将专注于你的伽马密度。 不用像 output 那样太担心代码。 下面我有n
的不同值的前几个 (x,y) 坐标。
library(purrr)
mygamma <- function(x) {
dgamma(x,
shape=fit[['gamma']]$estimate[1], # 0.6225622
rate=fit[['gamma']]$estimate[2]) # 0.3568242
}
number_of_x <- c(5, 10, 101, 75000)
purrr::imap_dfr(number_of_x, ~ curve(mygamma, xlim = c(0, 6), n = .), .id = "n") %>%
dplyr::mutate_at(1, ~ sprintf("n = %i", number_of_x[as.numeric(.)])) %>%
dplyr::mutate(n = factor(n, unique(n))) %>%
dplyr::filter(x > 0) %>%
dplyr::group_by(n) %>%
dplyr::slice_min(order_by = x, n = 5)
n x y
<fct> <dbl> <dbl>
1 n = 5 1.5 0.184
2 n = 5 3 0.0828
3 n = 5 4.5 0.0416
4 n = 5 6 0.0219
5 n = 10 0.667 0.336
6 n = 10 1.33 0.204
7 n = 10 2 0.138
8 n = 10 2.67 0.0975
9 n = 10 3.33 0.0707
10 n = 101 0.06 1.04
11 n = 101 0.12 0.780
12 n = 101 0.18 0.655
13 n = 101 0.24 0.575
14 n = 101 0.3 0.518
15 n = 75000 0.0000800 12.9
16 n = 75000 0.000160 9.90
17 n = 75000 0.000240 8.50
18 n = 75000 0.000320 7.62
19 n = 75000 0.000400 7.01
请注意,当n = 5
时,您绘制的值非常少。 随着n
的增加,x 值之间的距离变小。 由于这些函数是连续的,因此 plot 的点数是无限的,但这无法通过计算完成,因此绘制了 x 值的子集以进行近似。 x 值越多,近似值就越好。 通常,默认的n = 101
可以正常工作,但由于伽马和对数正态密度具有如此尖锐的峰值,plot function 正在超过最大值。 下面是完整的 plot 数据, n = 5, 10, 101, 75000
5、10、101、75000 并添加了点。
最后我使用了这个解决方案,在这里找到:
mygamma <- function(x) dgamma(x, shape=fit[['gamma']]$estimate[1],
rate=fit[['gamma']]$estimate[2])
get_curve_values <- function(fn, x_data){
res <- curve(fn, from=0, to=x_data)
dev.off()
res
}
curve_val <- get_curve_values(mygamma, x_data)
ylim <- max(curve_val$y,na.rm = TRUE)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.