简体   繁体   中英

multirow contingency table in R

Let's consider this data set:

# Toy data set: 20 patients with two numeric columns (age, size), two
# character columns (sex, smoker), one factor (fac) and a binary outcome.
# `replace = TRUE` is spelled out instead of relying on partial matching of
# `rep` and on the reassignable shortcut `T`.
df <- data.frame(
  age = sample(20:90, 20, replace = TRUE),
  sex = sample(c("m", "f"), 20, replace = TRUE),
  smoker = sample(c("never", "former", "active"), 20, replace = TRUE),
  size = sample(8:40, 20, replace = TRUE),
  fac = as.factor(sample(c("neg", "lo", "med", "hi"), 20, replace = TRUE)),
  outcome = sample(c(0, 1), 20, replace = TRUE)
)
# let's introduce some missing data
# Three random cells are set to NA; the same cell may be picked twice, so the
# number of missing values ends up between 1 and 3.
for (i in 1:3) {
  df[sample(seq_len(nrow(df)), 1), sample(seq_len(ncol(df)), 1)] <- NA
}

In a medical manuscript, the first table summarizes the population (or its subgroups, as appropriate); here the rows would be age, sex, smoking status, etc., and the two outcomes would be listed in separate columns. Continuous variables are reported as means; categorical variables as counts.

  1. I was wondering if there is a function I am missing that creates such contingency tables. I can do that manually, but I would love for the table to update automatically if the data set changes. Ultimately I need to output it in LaTeX.
  2. The function would need to ignore missing data, but not delete those rows.

Asking too much?!

In medical articles, 'Table 1' summarizes the demographics of the study population, usually broken down between subgroups

Generate data set

# Simulated study population of n patients.
#   age, size : continuous variables
#   sex, smoker, labtest : categorical variables (stored as factors)
#   logitest  : logical variable
#   outcome   : binary diagnosis (0/1) used to split the subgroups
n <- 100
df <- data.frame(
  age = sample(20:90, n, replace = TRUE),
  # every column is drawn with size n so nothing is silently recycled
  sex = sample(c("m", "f"), n, replace = TRUE, prob = c(0.55, 0.45)),
  smoker = sample(
    c("never", "former", "active"), n,
    replace = TRUE, prob = c(0.4, 0.45, 0.15)
  ),
  size = abs(rnorm(n, 20, 8)),
  logitest = sample(c(TRUE, FALSE), n, replace = TRUE, prob = c(0.1, 0.9)),
  labtest = as.factor(sample(
    c("neg", "lo", "med", "hi"), n,
    replace = TRUE, prob = c(0.4, 0.3, 0.2, 0.1)
  )),
  outcome = sample(c(0, 1), n, replace = TRUE, prob = c(0.8, 0.2)),
  # The variable classification further down relies on is.factor(); since
  # R 4.0 character columns are no longer converted automatically.
  stringsAsFactors = TRUE
)

# let's introduce some missing data
# One randomly chosen cell is blanked per iteration (roughly n/6 cells in
# total; a cell may be hit more than once). seq_len() keeps the loop from
# running over c(1, 0) when n is smaller than 6.
for (i in seq_len(n %/% 6)) {
  df[sample(seq_len(n), 1), sample(seq_len(ncol(df)), 1)] <- NA
}
head(df)
##   age sex smoker  size logitest labtest outcome
## 1  70   m former 39.17       NA     med      NA
## 2  51   f former 33.64    FALSE      hi       1
## 3  58   f former 10.10    FALSE     neg       1
## 4  30   m former 43.24    FALSE     med       0
## 5  54   m former 22.78    FALSE      lo       0
## 6  86   f former  8.20    FALSE     neg       0
If working with a real data set, use it instead.
# Column position of the outcome/diagnosis within df.
dx <- 7
####################################
# Per-column summary of everything except the outcome column.
summary(df[-dx])

Change this as needed: the column with the diagnosis has to be removed from the variables list!

# Puts the columns of df on the search path so they can be referred to by
# bare name.
# NOTE(review): the code visible in this file always goes through df[...], so
# this call looks unnecessary; attach() is generally discouraged because the
# attached copies do not follow later changes to df. Confirm nothing else
# depends on it before removing.
attach(df)
 ## age sex smoker size logitest ## Min. :20.0 f :44 active:19 Min. : 0.91 Mode :logical ## 1st Qu.:42.5 m :54 former:49 1st Qu.:15.00 FALSE:85 ## Median :58.0 NA's: 2 never :30 Median :20.12 TRUE :12 ## Mean :57.3 NA's : 2 Mean :20.44 NA's :3 ## 3rd Qu.:74.0 3rd Qu.:27.10 ## Max. :88.0 Max. :43.24 ## NA's :1 NA's :2 ## labtest ## hi : 4 ## lo :29 ## med :20 ## neg :45 ## NA's: 2 ## ## 
# All column names of df, in column order.
vars <- names(df)
vars

Build list of vars

# Drop the outcome column from the list of variables to describe.
vars <- vars[-dx]
vars
## [1] "age"      "sex"      "smoker"   "size"     "logitest" "labtest"

# Classify every remaining variable by the type of its column. Columns are
# looked up by NAME: after removing the outcome, position i in `vars` only
# matches column i of df when the outcome happens to be the last column.
is_cat <- vapply(vars, function(v) is.factor(df[[v]]), logical(1))
is_logi <- vapply(vars, function(v) is.logical(df[[v]]), logical(1))

catvars <- vars[is_cat]               # categorical variables (factors)
logivars <- vars[is_logi]             # logic variables
contvars <- vars[!is_cat & !is_logi]  # continuous variables (everything else)
contvars
## [1] "age"  "size"
catvars
## [1] "sex"     "smoker"  "labtest"
logivars
## [1] "logitest"
 logivars \n
# Subgroup with a recorded outcome of 0; rows whose outcome is NA are left out.
bg <- df[which(df[[dx]] == 0), ]
nrow(bg)  #; bg

Create the subgroups

# Subgroup with a recorded outcome of 1; rows whose outcome is NA are left out.
mg <- df[which(df[[dx]] == 1), ]
nrow(mg)  #; mg
## [1] 23
# Rows whose outcome is missing: kept aside rather than dropped silently.
indet <- df[is.na(df[[dx]]), ]
nrow(indet)
## [1] 4
indet
##    age sex smoker   size logitest labtest outcome
## 1   70   m former 39.173       NA     med      NA
## 9   87   m former 23.621    FALSE      lo      NA
## 18  65   m former  2.466    FALSE    <NA>      NA
## 67  88   f former 17.575    FALSE     med      NA
 indet \n
# Shapiro-Wilk normality test for every continuous variable, run on all
# patients (bg and mg alike). Columns are addressed by name: the position of a
# variable in `vars` is not its position in df once the outcome column has
# been removed from `vars`, unless the outcome is the last column.
normality <- vapply(
  contvars,
  function(v) shapiro.test(df[[v]])$p.value,
  numeric(1),
  USE.NAMES = FALSE
)
normality

For continuous variables

normality testing
# Collect the continuous-variable results in one list: the variable names,
# their normality p-values and the subgroup-comparison p-values in ttpvalue.
contvarlist <- list(
  variables = contvars,
  normality = normality,
  ttest.by.subgroup = ttpvalue
)
 ## [1] 0.00125 0.73602 
Comparing the means of two samples: if normal, use the t-test; otherwise, use the Wilcoxon test.
# Chi-squared test of independence between each logical variable and the
# outcome. summary() of a two-way table reports that test; only its p-value is
# kept. Columns are looked up by name so the result does not depend on where
# the outcome column sits in df, and vapply() copes with an empty `logivars`.
proppvalue <- vapply(
  logivars,
  function(v) summary(table(df[[v]], df[[dx]]))$p.value,
  numeric(1),
  USE.NAMES = FALSE
)
proppvalue
## [1] 0.5551
logivarlist <- list(variables = logivars, chisq.by.subgroup = proppvalue)

For categorical variables

# Continuous variables: a Shapiro p-value below .05 means the population is
# likely NOT normally distributed; a t-test p-value below .05 means the two
# subgroups likely have different means.
str(contvarlist)
## List of 3
##  $ variables        : chr [1:2] "age" "size"
##  $ normality        : num [1:2] 0.00125 0.73602
##  $ ttest.by.subgroup: num [1:2] 0.636 0.367
# Categorical variables: a chi-squared p-value below .05 means the variable
# and the outcome are likely NOT independent.
# NOTE(review): catvarlist is not created anywhere in the visible code --
# confirm where it is built.
str(catvarlist)

For logic variables

# Logical variables: a chi-squared p-value below .05 means the variable and
# the outcome are likely NOT independent.
str(logivarlist)
## List of 2
##  $ variables        : chr "logitest"
##  $ chisq.by.subgroup: num 0.555
 logivarlist = list(variables = logivars, chisq.by.subgroup = proppvalue) \n

And now, the results!

 str(contvarlist) #if shapiro p<.05 then pop likely NOT normally dist;  if t-test p<.05 then pop likely have different means \n
 ## List of 3 ## $ variables : chr [1:2] "age" "size" ## $ normality : num [1:2] 0.00125 0.73602 ## $ ttest.by.subgroup: num [1:2] 0.636 0.367 
 str(catvarlist) #if chisq p<.05 then variables are likely NOT independent \n
 ## List of 2 ## $ variables : chr [1:3] "sex" "smoker" "labtest" ## $ chisq.by.subgroup: num [1:3] 0.0158 0.7712 0.3948 
 str(logivarlist) #if chisq p<.05 then variables are likely NOT independent \n
 ## List of 2 ## $ variables : chr "logitest" ## $ chisq.by.subgroup: num 0.555 

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM