簡體   English   中英

如何計算 R 中數值變量和分類變量的描述性統計量?

[英]How to calculate descriptive statistics for both numeric and categorical variables in R?

我正在嘗試編寫一個函數(function)來計算數值變量和分類變量(因子)的描述性統計量。 對於數值型變量,計算均值(MEAN)、中位數(MEDIAN)、標準差(SD)以及缺失值個數(NMiss)。 對於字符變量,應將變量各級別內的計數製表,並統計缺失值的個數。

起始輸入數據為:

  ID GLUC TGL HDL LDL  HRT MAMM SMOKE
1  A   88  NA  32  99    Y <NA>  ever
2  B   NA 150  60  NA <NA>   no never
3  C  110  NA  NA 120    N <NA>  <NA>
4  D   NA 200  65 165 <NA>  yes never

我希望它看起來像這樣:

> table1 (dat=patient, numvar=c("TGL", "HDL", "LDL"), charvar=c("HRT", "MAMM"))
$numericStats
  varName      MEAN MEDIAN       SD NMiss
1     TGL 180.66667  180.0 23.03620     4
2     HDL  55.66667   62.5 19.00175     4
3     LDL 160.28571  165.0 40.06126     3
$FactorStats
  varName group count
1     HRT     N     2
2             Y     3
3         NMiss     5
4    MAMM    no     2
5           yes     4
6         NMiss     4

這是我到目前為止的代碼:

#numericstats
    # Summarise numeric columns of a data frame.
    #
    # Arguments:
    #   dat    - data frame holding the columns to summarise.
    #   numvar - character vector naming the numeric columns of `dat`.
    #
    # Returns a data frame with one row per entry of `numvar` and the columns
    #   varname  - the column name,
    #   mean, median, sd - computed with missing values removed and rounded
    #                      to 2 decimals,
    #   nmissing - number of NA values in the column.
    # Stops with an error if any name in `numvar` is not a column of `dat`.
    findnum <- function(dat, numvar) {
      unknown <- setdiff(numvar, names(dat))
      if (length(unknown) > 0) {
        stop("columns not found in `dat`: ", paste(unknown, collapse = ", "),
             call. = FALSE)
      }

      # No variables requested: return an empty table with the same columns.
      if (length(numvar) == 0) {
        return(data.frame(
          varname = character(0), mean = numeric(0), median = numeric(0),
          sd = numeric(0), nmissing = integer(0),
          stringsAsFactors = FALSE
        ))
      }

      # One single-row data frame per variable; iterating over the names
      # themselves visits every variable (the old `for (i in length(...))`
      # only ever saw the last one).
      rows <- lapply(numvar, function(v) {
        values <- dat[[v]]
        data.frame(
          varname = v,
          mean = round(mean(values, na.rm = TRUE), 2),
          median = round(median(values, na.rm = TRUE), 2),
          sd = round(sd(values, na.rm = TRUE), 2),
          nmissing = sum(is.na(values)),
          stringsAsFactors = FALSE
        )
      })

      # Stack the per-variable rows instead of overwriting a single result.
      do.call(rbind, rows)
    }
    findnum(dat=patient, numvar=c("TGL","HDL","LDL"))
    
    #factorstats
    # Tabulate categorical (character/factor) columns of a data frame.
    #
    # Arguments:
    #   dat     - data frame holding the columns to tabulate.
    #   charvar - character vector naming the categorical columns of `dat`.
    #
    # Returns a data frame in long format with the columns
    #   varname - the column name,
    #   group   - each observed level of the column, followed by "NMiss",
    #   count   - number of rows at that level; for "NMiss", the number of
    #             NA values in the column.
    # Stops with an error if any name in `charvar` is not a column of `dat`.
    findfactor <- function(dat, charvar) {
      unknown <- setdiff(charvar, names(dat))
      if (length(unknown) > 0) {
        stop("columns not found in `dat`: ", paste(unknown, collapse = ", "),
             call. = FALSE)
      }

      # No variables requested: return an empty table with the same columns.
      if (length(charvar) == 0) {
        return(data.frame(
          varname = character(0), group = character(0), count = integer(0),
          stringsAsFactors = FALSE
        ))
      }

      rows <- lapply(charvar, function(v) {
        values <- dat[[v]]
        # table() leaves NA out by default, so missing values are counted
        # separately and appended as their own "NMiss" row.
        level_counts <- table(values)
        data.frame(
          varname = v,
          group = c(names(level_counts), "NMiss"),
          count = c(as.vector(level_counts), sum(is.na(values))),
          stringsAsFactors = FALSE
        )
      })

      # Stack the per-variable tables instead of overwriting a single result.
      do.call(rbind, rows)
    }
    findfactor(dat=patient, charvar=c("MAMM","SMOKE"))
    
    #full function
    # Build the combined descriptive-statistics table.
    #
    # Arguments:
    #   dat     - data frame holding the variables.
    #   numvar  - character vector naming the numeric columns to summarise.
    #   charvar - character vector naming the categorical columns to tabulate.
    #
    # Returns a list with two elements:
    #   numericStats - the result of findnum(dat, numvar),
    #   FactorStats  - the result of findfactor(dat, charvar).
    # The two tables have different numbers of rows, so they are returned
    # side by side in a list rather than forced into a single data frame.
    table1 <- function(dat, numvar, charvar) {
      # The caller already says which columns are numeric and which are
      # categorical, so each helper is called once with its own name vector;
      # no loop over the columns of `dat` is needed.
      list(
        numericStats = findnum(dat, numvar),
        FactorStats = findfactor(dat, charvar)
      )
    }

這是使用lapply的一種方法:

# Descriptive statistics for numeric and categorical columns of a data frame.
#
# Arguments:
#   df      - data frame holding the variables.
#   numvar  - character vector naming the numeric columns to summarise.
#   charvar - character vector naming the categorical columns to tabulate.
#
# Returns a list with two data frames:
#   numericStats - one row per numeric variable with VarName, MEAN, MEDIAN,
#                  SD (all computed with NA removed) and NMiss, the number
#                  of missing values.
#   FactorStats  - one row per level of each categorical variable with
#                  Varname, group and count, plus a "Nmiss" row per variable
#                  holding its number of missing values.
table1 <- function(df, numvar, charvar) {
  numeric_rows <- lapply(df[numvar], function(x) {
    data.frame(
      MEAN = mean(x, na.rm = TRUE),
      MEDIAN = median(x, na.rm = TRUE),
      SD = sd(x, na.rm = TRUE),
      # Count the missing values; `sum(!is.na(x))` would count the
      # non-missing ones instead.
      NMiss = sum(is.na(x))
    )
  })

  factor_rows <- lapply(charvar, function(x) {
    # table() drops NA, so the missing count is appended as an extra named
    # entry before stacking the counts into (group, count) columns.
    tab <- stack(c(table(df[[x]]), Nmiss = sum(is.na(df[[x]]))))[2:1]
    names(tab) <- c("group", "count")
    cbind(Varname = x, tab)
  })

  list(
    numericStats = cbind(VarName = numvar, do.call(rbind, numeric_rows)),
    FactorStats = do.call(rbind, factor_rows)
  )
}

table1(patient, numvar=c("TGL", "HDL", "LDL"), charvar=c("HRT", "MAMM"))

#$numericStats
#    VarName  MEAN MEDIAN   SD NMiss
#TGL     TGL 175.0    175 35.4     2
#HDL     HDL  52.3     60 17.8     3
#LDL     LDL 128.0    120 33.7     3

#$FactorStats
#  Varname group count
#1     HRT     N     1
#2     HRT     Y     1
#3     HRT Nmiss     2
#4    MAMM    no     1
#5    MAMM   yes     1
#6    MAMM Nmiss     2

數據

# Example data: four patients (rows) with lab values and questionnaire
# answers. Numeric columns are integer; the categorical columns are kept as
# plain character vectors. NA marks a missing value.
patient <- data.frame(
  ID = c("A", "B", "C", "D"),
  GLUC = c(88L, NA, 110L, NA),
  TGL = c(NA, 150L, NA, 200L),
  HDL = c(32L, 60L, NA, 65L),
  LDL = c(99L, NA, 120L, 165L),
  HRT = c("Y", NA, "N", NA),
  MAMM = c(NA, "no", NA, "yes"),
  SMOKE = c("ever", "never", NA, "never"),
  stringsAsFactors = FALSE
)

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM