如何創建一個 for 循環來遍歷 R 中 glm 的多年組合？

Question

我有一個大型數據集，其中包含 Blue Rockfish 的存在和不存在 (0,1) 和多個變量（在我的例子中，是測深、曲率、東度、細尺度 BPI 和大尺度 BPI）。

structure(list(Pres_Abs = c(1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L), CommonName = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = "Blue Rockfish", class = "factor"), Survey_Yea = c(2009L, 
2014L, 2005L, 2015L, 2006L, 2009L, 2014L, 2015L, 2015L, 2015L, 
2005L, 2014L, 2015L, 2015L, 2014L, 2015L, 2015L, 2015L, 2015L, 
2006L), ca_10mbath = c(-42.6913986, -36.0038986, -36.5155983, 
-44.4014816, -39.3320007, -45.7226982, -47.9375, -51.5976982, 
-37.7812996, -14.1093302, -70.5976562, -41.5121307, -48.4246902, 
-46.0937996, -38.9961014, -46.375, -42.6913986, -60.96875, -46.375, 
-37.6601601), ca10_bpi24 = c(-12L, -2L, -2L, -2L, -2L, -2L, 7L, 
37L, -2L, 77L, -2L, -2L, 17L, 7L, -2L, -2L, -12L, -2L, -2L, 67L
), ca_10m_cur = c(-0.0859528, -0.0006409, -0.0068855, -0.5154228, 
-0.0390663, -0.0078316, -0.0221901, 0.792961, 0, 4.3303394, 0.0429688, 
-0.4405556, -0.1947556, 0.0195274, -0.230453, -0.0093803, -0.0859528, 
-0.2148438, -0.0093803, 0.0976486), ca_10m_eas = c(0.727106, 
0.887252, 0.565906, 0.9994883, 0.96552, 0.960033, 0.998732, 0.772206, 
0.589553, -0.4134142, -0.8266082, -0.3659272, -0.7330094, 0.0329623, 
0.998884, 0.271237, 0.727106, -0.5498384, 0.271237, 0.6424425
), ca10_bpi30 = c(-15L, -15L, -15L, -15L, -15L, -15L, -15L, -15L, 
-15L, 262L, -15L, -15L, -15L, -15L, -15L, -15L, -15L, -15L, -15L, 
-15L)), row.names = c(2032L, 3801L, 479L, 4421L, 997L, 1551L, 
3079L, 4657L, 5059L, 4104L, 261L, 2849L, 4460L, 4765L, 3535L, 
4842L, 4950L, 4323L, 4833L, 752L), class = "data.frame")

另外，我有多年的數據（2005、2006、2007、2009、2014、2015）。 我基本上是想創建一個 glm

Pres_Abs~bathy+curvature+eastness+broadscale+finescale, data=Blue_allyears, family=binomial(link=logit))

這經歷了多年的每一個組合。 因此，在 1 年級別上，我使用 2005 年的數據、2006 年的數據、2007 年的數據等創建了 glms。在該代碼中，我保存了 AIC、殘差和空偏差、卡方、p 等數據價值等。

這是我用來循環第一年數據的代碼（改編自 stackoverflow 上的其他人）：

results <- data.frame()


for(Survey_Yea in unique(Blue_allyears$Survey_Yea)){
  
  # dynamically generate formula
        fmla <- as.formula(Pres_Abs~ca_10mbath+ca_10m_cur+ca_10m_eas+ca10_bpi30+ca10_bpi24)

        # fit glm model
        fit<-glm(fmla,data=Blue_allyears[Blue_allyears$Survey_Yea == Survey_Yea,],family=binomial(link=logit))

        ## capture summary stats
        
        AIC <- AIC(fit)
        Deviance <- deviance(fit)
        NullDeviance <- fit$null.deviance
        null_minus_dev<-NullDeviance-Deviance
        df.residual<- fit$df.residual      
        df.null<-fit$df.null 
        df.null.minus.df.residual<-df.null-df.residual
       pvalue<- with(fit, 1-pchisq(null_minus_dev ,  df.null.minus.df.residual))
       Years<-"1"
   

        # get coefficents of fit
        cfit <- coef(summary(fit))

        # create temporary data frame
        df <- data.frame( Survey_Yea = Survey_Yea, 
                         AIC = AIC(fit), Deviance = deviance(fit),NullDeviance = fit$null.deviance, null.minus.dev=NullDeviance-Deviance, df.residual= fit$df.residual ,   df.null=fit$df.null , df.null.minus.df.residual=df.null-df.residual,  pvalue= pvalue,  Years="1", stringsAsFactors = F)

        # bind rows of temporary data frame to the results data frame
        results <- rbind(results, df)
}


results

這段代碼很棒，並且從每年的數據中創建了 glms。

structure(list(Survey_Yea = c(2005L, 2006L, 2007L, 2009L, 2014L, 
2015L), AIC = c(731.84838805646, 480.699964265887, 113.681123536743, 
764.359566454308, 1482.05275641814, 1581.2853892652), Deviance = c(719.84838805646, 
468.699964265887, 101.681123536743, 752.359566454308, 1470.05275641814, 
1569.2853892652), NullDeviance = c(987.041585117362, 690.374591837705, 
174.673089501106, 1059.1288918956, 2412.15218834861, 2012.89941234608
), null.minus.dev = c(267.193197060902, 221.674627571818, 72.991965964363, 
306.769325441288, 942.099431930472, 443.614023080884), df.residual = c(706L, 
492L, 120L, 758L, 1734L, 1446L), df.null = c(711L, 497L, 125L, 
763L, 1739L, 1451L), df.null.minus.df.residual = c(5L, 5L, 5L, 
5L, 5L, 5L), pvalue = c(0, 0, 2.44249065417534e-14, 0, 0, 0), 
    Years = c("1", "1", "1", "1", "1", "1")), row.names = c(NA, 
-6L), class = "data.frame")

現在，我想通過兩年的數據創建 glms 並提取相關數據。 因此，例如，年份迭代將是： 2005 和 2006 2005 和 2007 2005 和 2009 2005 和 2014 2005 和 2015 2006 和 2007 2006 和 2009 等...... 2014 和 2015

在用兩年的數據完成此操作后，我想用三年的數據進行每一次組合，等等，直到我開始使用所有年份的數據。

我一直在嘗試添加另一個 for 循環或將 combn() 添加到我的代碼中，但沒有運氣。

任何幫助將非常感激！

另外，這是我第一次發帖，如果您需要更多數據，請告訴我。 謝謝！

Answer 1

考慮將所有處理封裝在一個定義的方法中，在該方法中您接收年數和年數的組合向量作為參數。 然后，使用lapply + combn進行迭代。

功能

run_model <- function(vec, yr) {
  # subset data by years
  sub <- Blue_allyears[Blue_allyears$Survey_Yea %in% vec,]
  
  # dynamically generate formula
  fmla <- Pres_Abs ~ ca_10mbath+ca_10m_cur+ca_10m_eas+ca10_bpi30+ca10_bpi24
  
  # fit glm model
  fit <- glm(fmla, data=sub, family=binomial(link=logit))
  
  ## capture summary stats
  AIC <- AIC(fit)
  Deviance <- deviance(fit)
  NullDeviance <- fit$null.deviance
  null_minus_dev <- NullDeviance - Deviance
  df.residual <- fit$df.residual      
  df.null <- fit$df.null 
  df.null.minus.df.residual <- df.null - df.residual
  pvalue <- 1 - pchisq(null_minus_dev,  df.null.minus.df.residual)
  
  # get coefficents of fit
  cfit <- coef(summary(fit))
  
  # create temporary data frame
  df <- data.frame(
    Survey_Yea = paste(vec, collapse=", "), 
    AIC = AIC,
    Deviance = Deviance,
    NullDeviance = NullDeviance, 
    null.minus.dev = null_minus_dev,
    df.residual = df.residual,   
    df.null = df.null, 
    df.null.minus.df.residual = df.null.minus.df.residual,
    pvalue = pvalue, 
    Years = yr, 
    stringsAsFactors = FALSE   # DEFAULT IN R 1.4.0+
  )
  
  return(df)
}

稱呼

years <- sort(unique(Blue_allyears$Survey_Yea))

# RETURN NESTED LIST OF MANY DATA FRAMES
results_df_list <- lapply(1:3, function(i) combn(
  years, i, run_model, simplify=FALSE, num_yr=i)
)

# RETURN FLATTENED LIST OF THREE DATA FRAMES AND
# RENAME ELEMENTS
results_df_list <- setNames(
  lapply(results_df_list, function(dfs) do.call(rbind, dfs)),
  c("years_1", "years_2", "years_3")
)

# REVIEW EMBEDDED DATA FRAMES
View(results_df_list$years_1)
View(results_df_list$years_2)
View(results_df_list$years_3)

演示

用 OP 的截圖圖像的隨機數據匹配結構來演示：

set.seed(52222)
Blue_allyears <- data.frame(
  Survey_Yea = sample(2005:2014, 500, replace=TRUE),
  Pres_Abs = sample(0:1, 500, replace=TRUE),
  ca_10mbath = runif(500),
  ca_10m_cur = runif(500),
  ca_10m_eas = runif(500),
  ca10_bpi30 = runif(500),
  ca10_bpi24 = runif(500)
)

#...run above blocks...

head(results_df_list$years_1)
#   Survey_Yea      AIC Deviance NullDeviance null.minus.dev df.residual df.null df.null.minus.df.residual     pvalue Years
# 1       2005 83.68461 71.68461     81.77442      10.089809          53      58                         5 0.07273019     1
# 2       2006 68.09388 56.09388     60.28383       4.189951          41      46                         5 0.52240456     1
# 3       2007 69.25363 57.25363     62.18310       4.929472          39      44                         5 0.42454811     1
# 4       2008 79.01764 67.01764     70.52444       3.506803          45      50                         5 0.62235846     1
# 5       2009 81.57290 69.57290     74.19185       4.618955          48      53                         5 0.46412711     1
# 6       2010 85.46602 73.46602     76.88259       3.416573          51      56                         5 0.63604708     1

head(results_df_list$years_2)
#   Survey_Yea      AIC Deviance NullDeviance null.minus.dev df.residual df.null df.null.minus.df.residual    pvalue Years
# 1 2005, 2006 152.5382 140.5382     145.0927       4.554509         100     105                         5 0.4726236     2
# 2 2005, 2007 153.2814 141.2814     144.0207       2.739315          98     103                         5 0.7400991     2
# 3 2005, 2008 159.2930 147.2930     152.3469       5.053860         104     109                         5 0.4093425     2
# 4 2005, 2009 160.5739 148.5739     156.2174       7.643473         107     112                         5 0.1770101     2
# 5 2005, 2010 167.3905 155.3905     159.5665       4.176056         110     115                         5 0.5243568     2
# 6 2005, 2011 153.0582 141.0582     145.5514       4.493158          99     104                         5 0.4807993     2

head(results_df_list$years_3)
#         Survey_Yea      AIC Deviance NullDeviance null.minus.dev df.residual df.null df.null.minus.df.residual    pvalue Years
# 1 2005, 2006, 2007 219.1731 207.1731     208.5284       1.355302         145     150                         5 0.9291396     3
# 2 2005, 2006, 2008 225.7515 213.7515     216.8769       3.125365         151     156                         5 0.6806653     3
# 3 2005, 2006, 2009 228.9630 216.9630     221.4069       4.443965         154     159                         5 0.4874155     3
# 4 2005, 2006, 2010 235.7721 223.7721     225.9108       2.138620         157     162                         5 0.8296509     3
# 5 2005, 2006, 2011 218.5088 206.5088     209.4254       2.916605         146     151                         5 0.7128412     3
# 6 2005, 2006, 2012 213.4275 201.4275     210.2102       8.782750         147     152                         5 0.1180497     3

如何創建一個 for 循環來遍歷 R 中 glm 的多年組合？

問題描述

1 個解決方案

解決方案1
1 已采納 2022-05-23 00:57:12

如何創建一個 for 循環來遍歷 R 中 glm 的多年組合？

問題描述

1 個解決方案

解決方案1 1 已采納 2022-05-23 00:57:12

解決方案1
1 已采納 2022-05-23 00:57:12