繁体   English   中英

计算分布拟合函数 R 的 y 值

[英]Calculate y value for distribution fitting functions R

提示:本站提供中英文对照查看,鼠标放在中文字句上可显示英文原文。 若本文未解决您的问题,推荐您尝试使用帮您解决。

我正在为不同的分布函数绘制曲线,我需要知道每条曲线的最高 y 值。 稍后我将 plot 仅选择一条曲线作为最佳拟合。

这是 function (它有点硬编码,我正在处理它):

library(plyr)
library(dplyr)
library(fitdistrplus)
library(evd)
library(gamlss)
        
        
fdistr <- function(d) {
  
  #  Uncomment to try  run line by line
  # d <- data_to_plot
  
  TLT <- d$TLT
  if (sum(TLT<=0)) {TLT[TLT<=0] <- 0.001} # removing value < 0 for log clculation
  gev <- fgev(TLT, std.err=FALSE)
  distr <- c('norm', 'lnorm', 'weibull', 'gamma')
  fit <- lapply(X=distr, FUN=fitdist, data=TLT)
  fit[[5]] <- gev
  distr[5] <- 'gev'
  names(fit) <- distr
  Loglike <- sapply(X=fit, FUN=logLik)
  Loglike_Best <- which(Loglike == max(Loglike))
  
  #  Uncomment to try  run line by line
  # max <- which.max(density(d$TLT)$y)
  # max_density <- stats::density(d$TLT)$y[max]
  # max_y <- max_density
  
  x_data <- max(d$TLT)
  
  hist(TLT, prob=TRUE, breaks= x_data,
       main=paste(d$DLT_Code[1], 
                  '- best :',
                  names(Loglike[Loglike_Best])),
       sub = 'Total Lead Times',
       col='lightgrey',
       border='white'
       # ylim=  c(0,max_y)
  )
  
  lines(density(TLT),
        col='darkgrey',
        lty=2,
        lwd=2)
  
  grid(nx = NA, ny = NULL, col = "gray", lty = "dotted",
       lwd = .5, equilogs = TRUE)
  
  curve(dnorm(x, 
              mean=fit[['norm']]$estimate[1], 
              sd=fit[['norm']]$estimate[2]), 
        add=TRUE, col='blue', lwd=2)
  
  curve(dlnorm(x, 
               meanlog=fit[['lnorm']]$estimate[1], 
               sdlog=fit[['lnorm']]$estimate[2]), 
        add=TRUE, col='darkgreen', lwd=2)
  
  curve(dweibull(x, 
                 shape=fit[['weibull']]$estimate[1], 
                 scale=fit[['weibull']]$estimate[2]), 
        add=TRUE, col='purple', lwd=2)
  
  curve(dgamma(x, 
               shape=fit[['gamma']]$estimate[1], 
               rate=fit[['gamma']]$estimate[2]), 
        add=TRUE, col='Gold', lwd=2)
  
  
  curve(dgev(x, 
             loc=fit[['gev']]$estimate[1],
             scale=fit[['gev']]$estimate[2], 
             shape=fit[['gev']]$estimate[3]), 
        add=TRUE, col='red', lwd=2)
  
  
  legend_loglik <- paste(c('Norm', 'LogNorm', 'Weibull', 'Gamma','GEV'), c(':'),
                         round(Loglike, digits=2))
  
  legend("topright", legend=legend_loglik, 
         col=c('blue', 'darkgreen', 'purple', 'gold', 'red'),
         lty=1, lwd=2,
         bty='o', bg='white', box.lty=2, box.lwd = 1, box.col='white')  
  
  return(data.frame(DLT_Code = d$DLT_Code[1],
                    n = length(d$TLT),
                    Best = names(Loglike[Loglike_Best]),
                    lnorm = Loglike[1],
                    norm = Loglike[2],
                    weibul = Loglike[3],
                    gamma = Loglike[4],
                    GEV = Loglike[5]))
  
}



#  Creating data set
TLT <- c(rep(0,32), rep(1,120), rep(2,10), rep(3,67), rep(4,14),  rep(5,7), 6)
DLT_Code <- c(rep('DLT_Code',251))

data_to_plot <- data.frame(cbind(DLT_Code,TLT))
data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))


DLT_Distr <- do.call(rbind, by(data = data_to_plot, INDICES = data_to_plot$DLT_Code, FUN=fdistr))

我试图玩max_y然后在ylim中使用它。 我只能为正常密度做它,但不能用于 rest 曲线。

目前 plot 看起来像这样(一些曲线被剪掉了):

在此处输入图像描述

如果设置ylim = c(0,2)我们可以看到,对数正态分布和伽马分布超过 1:

在此处输入图像描述

我需要知道每条曲线的最大值,因此,当我选择要打印的曲线时,设置正确的ylim

您可以使用purrr::map_dbl到 map function optimize您的密度,如果您稍微重新排列您的代码并且您知道您想找到哪些输入值它们的最大值/存在密度。

您可以提前使用任何参数设置密度,这样您就可以使用optimize找到它们的峰值,并将它们传递给curve function。

作为一个可重复的小例子:

library(purrr)

# parameterize your densities
mynorm <- function(x) dnorm(x, mean = 0, sd = 1) 
mygamma <- function(x) dgamma(x, rate = .5, shape = 1) 

# get largest maximum over interval
ymax <- max(purrr::map_dbl(c(mynorm, mygamma), ~ optimize(., interval = c(0, 3), maximum = T)$objective))

# 0.4999811

# plot data
curve(mynorm, col = "blue", lwd = 2, xlim = c(0, 3), ylim = c(0, ymax * 1.1))
curve(mygamma, col = "red", lwd = 2, add = T)

使用您的代码,我已经实现了上述解决方案并调整了curve function 的x网格,以在我们在评论中讨论后向您展示我的意思,以使事情更清楚并向您展示您应该实际绘制的内容:

library(plyr)
library(dplyr)
library(fitdistrplus)
library(evd)
library(gamlss)
library(purrr) # <- add this library


fdistr <- function(d) {
  
  #  Uncomment to try  run line by line
  # d <- data_to_plot
  
  TLT <- d$TLT
  if (sum(TLT<=0)) {TLT[TLT<=0] <- 0.001} # removing value < 0 for log clculation
  gev <- fgev(TLT, std.err=FALSE)
  distr <- c('norm', 'lnorm', 'weibull', 'gamma')
  fit <- lapply(X=distr, FUN=fitdist, data=TLT)
  fit[[5]] <- gev
  distr[5] <- 'gev'
  names(fit) <- distr
  Loglike <- sapply(X=fit, FUN=logLik)
  Loglike_Best <- which(Loglike == max(Loglike))
  
  #  Uncomment to try  run line by line
  # max <- which.max(density(d$TLT)$y)
  # max_density <- stats::density(d$TLT)$y[max]
  # max_y <- max_density
  
  x_data <- max(d$TLT)
  
  # parameterize your densities before plotting
  mynorm <- function(x) {
    dnorm(x, 
          mean=fit[['norm']]$estimate[1], 
          sd=fit[['norm']]$estimate[2])
  }
  
  mylnorm <- function(x){
    dlnorm(x, 
           meanlog=fit[['lnorm']]$estimate[1], 
           sdlog=fit[['lnorm']]$estimate[2])
  }
  
  myweibull <- function(x) {
    dweibull(x, 
             shape=fit[['weibull']]$estimate[1], 
             scale=fit[['weibull']]$estimate[2])
  }
  
  mygamma <- function(x) {
    dgamma(x, 
           shape=fit[['gamma']]$estimate[1], 
           rate=fit[['gamma']]$estimate[2])
  }
  
  mygev <- function(x){
    dgev(x, 
         loc=fit[['gev']]$estimate[1],
         scale=fit[['gev']]$estimate[2], 
         shape=fit[['gev']]$estimate[3])
  }
  
  distributions <- c(mynorm, mylnorm, myweibull, mygamma, mygev)
  
  # get the max of each density
  y <- purrr::map_dbl(distributions, ~ optimize(., interval = c(0, x_data), maximum = T)$objective)

  # find the max (excluding infinity)
  ymax <- max(y[abs(y) < Inf])
  
  
  hist(TLT, prob=TRUE, breaks= x_data,
       main=paste(d$DLT_Code[1], 
                  '- best :',
                  names(Loglike[Loglike_Best])),
       sub = 'Total Lead Times',
       col='lightgrey',
       border='white',
       ylim=  c(0, ymax)
  )
  
  lines(density(TLT),
        col='darkgrey',
        lty=2,
        lwd=2)
  
  grid(nx = NA, ny = NULL, col = "gray", lty = "dotted",
       lwd = .5, equilogs = TRUE)
  
  curve(mynorm, 
        add=TRUE, col='blue', lwd=2, n = 1E5) # <- increase x grid
  
  curve(mylnorm, 
        add=TRUE, col='darkgreen', lwd=2, n = 1E5) # <- increase x grid
  
  curve(myweibull, 
        add=TRUE, col='purple', lwd=2, n = 1E5) # <- increase x grid
  
  curve(mygamma, 
        add=TRUE, col='Gold', lwd=2, n = 1E5) # <- increase x grid
  
  
  curve(mygev, 
        add=TRUE, col='red', lwd=2, n = 1E5) # <- increase x grid
  
  
  legend_loglik <- paste(c('Norm', 'LogNorm', 'Weibull', 'Gamma','GEV'), c(':'),
                         round(Loglike, digits=2))
  
  legend("topright", legend=legend_loglik, 
         col=c('blue', 'darkgreen', 'purple', 'gold', 'red'),
         lty=1, lwd=2,
         bty='o', bg='white', box.lty=2, box.lwd = 1, box.col='white')  
  
  return(data.frame(DLT_Code = d$DLT_Code[1],
                    n = length(d$TLT),
                    Best = names(Loglike[Loglike_Best]),
                    lnorm = Loglike[1],
                    norm = Loglike[2],
                    weibul = Loglike[3],
                    gamma = Loglike[4],
                    GEV = Loglike[5]))
  
}



#  Creating data set
TLT <- c(rep(0,32), rep(1,120), rep(2,10), rep(3,67), rep(4,14),  rep(5,7), 6)
DLT_Code <- c(rep('DLT_Code',251))

data_to_plot <- data.frame(cbind(DLT_Code,TLT))
data_to_plot$TLT <- as.numeric(as.character(data_to_plot$TLT ))


DLT_Distr <- do.call(rbind, by(data = data_to_plot, INDICES = data_to_plot$DLT_Code, FUN=fdistr))

在此处输入图像描述


为什么您的 plot 高度与解决方案 output 不匹配

为了进一步说明 plot 的情况以及您可能遇到的一些困惑,您需要了解curve function 是如何绘制数据的。 默认情况下, curve采用 101 个 x 值并评估您的函数以获得它们的 y 值,然后将这些点绘制为一条线。 因为您的某些密度的峰值非常尖锐, curve function 没有评估足够的 x 值来 plot 您的密度峰值。 为了表明你想要我的意思是我将专注于你的伽马密度。 不用像 output 那样太担心代码。 下面我有n的不同值的前几个 (x,y) 坐标。

library(purrr)

mygamma <- function(x) {
  dgamma(x, 
         shape=fit[['gamma']]$estimate[1], # 0.6225622
         rate=fit[['gamma']]$estimate[2]) # 0.3568242
}

number_of_x <- c(5, 10, 101, 75000)
purrr::imap_dfr(number_of_x, ~ curve(mygamma, xlim = c(0, 6), n = .), .id = "n") %>% 
  dplyr::mutate_at(1, ~ sprintf("n = %i", number_of_x[as.numeric(.)])) %>% 
  dplyr::mutate(n = factor(n, unique(n))) %>% 
  dplyr::filter(x > 0) %>% 
  dplyr::group_by(n) %>% 
  dplyr::slice_min(order_by = x, n = 5)

 n                 x       y
   <fct>         <dbl>   <dbl>
 1 n = 5     1.5        0.184 
 2 n = 5     3          0.0828
 3 n = 5     4.5        0.0416
 4 n = 5     6          0.0219
 5 n = 10    0.667      0.336 
 6 n = 10    1.33       0.204 
 7 n = 10    2          0.138 
 8 n = 10    2.67       0.0975
 9 n = 10    3.33       0.0707
10 n = 101   0.06       1.04  
11 n = 101   0.12       0.780 
12 n = 101   0.18       0.655 
13 n = 101   0.24       0.575 
14 n = 101   0.3        0.518 
15 n = 75000 0.0000800 12.9   
16 n = 75000 0.000160   9.90  
17 n = 75000 0.000240   8.50  
18 n = 75000 0.000320   7.62  
19 n = 75000 0.000400   7.01  

请注意,当n = 5时,您绘制的值非常少。 随着n的增加,x 值之间的距离变小。 由于这些函数是连续的,因此 plot 的点数是无限的,但这无法通过计算完成,因此绘制了 x 值的子集以进行近似。 x 值越多,近似值就越好。 通常,默认的n = 101可以正常工作,但由于伽马和对数正态密度具有如此尖锐的峰值,plot function 正在超过最大值。 下面是完整的 plot 数据, n = 5, 10, 101, 75000 5、10、101、75000 并添加了点。

![在此处输入图像描述

最后我使用了这个解决方案,在这里找到:

mygamma <- function(x) dgamma(x, shape=fit[['gamma']]$estimate[1], 
                                  rate=fit[['gamma']]$estimate[2]) 
get_curve_values <- function(fn, x_data){
res <- curve(fn, from=0, to=x_data)
dev.off()
res
}
curve_val <- get_curve_values(mygamma, x_data)
ylim <- max(curve_val$y,na.rm = TRUE)

问题未解决?试试使用:帮您解决问题。
暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2023 STACKOOM.COM