multirow contingency table in R

Question

Let's consider this data set:

# Toy data set: one row per patient, with demographics, a lab factor and a
# binary outcome. Arguments are spelled out (`replace = TRUE`) instead of
# relying on partial matching (`rep=`) and the reassignable shortcut `T`.
n_obs <- 20
df <- data.frame(
  age = sample(20:90, n_obs, replace = TRUE),
  sex = sample(c("m", "f"), n_obs, replace = TRUE),
  smoker = sample(c("never", "former", "active"), n_obs, replace = TRUE),
  size = sample(8:40, n_obs, replace = TRUE),
  fac = as.factor(sample(c("neg", "lo", "med", "hi"), n_obs, replace = TRUE)),
  outcome = sample(c(0, 1), n_obs, replace = TRUE)
)
# let's introduce some missing data: three random cells are set to NA
# (the same cell can be drawn twice, so between 1 and 3 cells end up missing)
for (i in seq_len(3)) {
  df[sample(nrow(df), 1), sample(ncol(df), 1)] <- NA
}

In a medical manuscript, the first table summarizes the population (or its subgroups, as appropriate); here the rows would be age, sex, smoking status, etc., and the two outcomes would be listed in separate columns. Continuous variables are reported as means and categorical variables as counts.

I was wondering if there is a function that I am missing that creates such contingency tables. I can do that manually, but I would love for the table to update automatically if the data set changes. Ultimately I need the output in LaTeX.
The function would need to ignore missing data, but not delete the rows that contain it.

Asking too much?!

Answer 1

In medical articles, 'Table 1' summarizes the demographics of the study population, usually broken down by subgroup.

Generate data set

n <- 100
# Simulated cohort of n patients. Categorical columns are built as factors
# explicitly: data.frame() does not convert strings to factors by default on
# R >= 4.0, and the variable classification later in this script relies on
# is.factor(). Every column draws n values (sex previously drew only 20 and
# was silently recycled).
df <- data.frame(
  age = sample(20:90, n, replace = TRUE),
  sex = factor(sample(c("m", "f"), n, replace = TRUE, prob = c(0.55, 0.45))),
  smoker = factor(sample(c("never", "former", "active"), n, replace = TRUE,
                         prob = c(0.4, 0.45, 0.15))),
  size = abs(rnorm(n, 20, 8)),
  logitest = sample(c(TRUE, FALSE), n, replace = TRUE, prob = c(0.1, 0.9)),
  labtest = factor(sample(c("neg", "lo", "med", "hi"), n, replace = TRUE,
                          prob = c(0.4, 0.3, 0.2, 0.1))),
  outcome = sample(c(0, 1), n, replace = TRUE, prob = c(0.8, 0.2))
)

# let's introduce some missing data: floor(n / 6) random cells are set to NA
for (i in seq_len(floor(n / 6))) {
  df[sample(n, 1), sample(ncol(df), 1)] <- NA
}
head(df)

##   age sex smoker  size logitest labtest outcome
## 1  70   m former 39.17       NA     med      NA
## 2  51   f former 33.64    FALSE      hi       1
## 3  58   f former 10.10    FALSE     neg       1
## 4  30   m former 43.24    FALSE     med       0
## 5  54   m former 22.78    FALSE      lo       0
## 6  86   f former  8.20    FALSE     neg       0

If you are working with a real data set, use it instead.

# Column position of the outcome/diagnosis variable in df; every later step
# uses dx to separate the outcome from the descriptive variables.
dx <- 7  #index of outcome/diagnosis
####################################
# Per-column summary of everything except the outcome column.
summary(df[, -dx])

Change this as needed: the column with the diagnosis has to be removed from the variables list!

# Put df's columns on the search path so they can be referenced by bare name.
# NOTE(review): the code visible here always indexes df explicitly, so this
# call looks unnecessary (and attach() is generally discouraged) -- confirm
# that nothing else relies on it before removing.
attach(df)

##       age         sex        smoker        size          logitest
##  Min.   :20.0   f   :44   active:19   Min.   : 0.91   Mode :logical
##  1st Qu.:42.5   m   :54   former:49   1st Qu.:15.00   FALSE:85
##  Median :58.0   NA's: 2   never :30   Median :20.12   TRUE :12
##  Mean   :57.3             NA's  : 2   Mean   :20.44   NA's :3
##  3rd Qu.:74.0                         3rd Qu.:27.10
##  Max.   :88.0                         Max.   :43.24
##  NA's   :1                            NA's   :2
##  labtest
##  hi  : 4
##  lo  :29
##  med :20
##  neg :45
##  NA's: 2
##
##

# Names of every column in df; the outcome column is dropped from this list
# further down.
vars <- names(df)
vars

Build list of vars

# Empty accumulators, one per kind of variable.
logivars <- NULL  # logical variables
contvars <- NULL  # continuous variables
catvars <- NULL  # categorical variables

# Remove the outcome column's name so only descriptive variables remain.
vars <- vars[-dx]
vars

## [1] "age"      "sex"      "smoker"   "size"     "logitest" "labtest"

# Sort every variable name into catvars (factor), logivars (logical) or
# contvars (anything else) according to the type of its column in df.
# Columns are looked up by name: indexing df by the position within `vars`
# only hits the right column when the outcome column (dx) is the last one.
# Plain if/else replaces ifelse(), which is meant for vectors, not for
# branching with side effects; iterating over `vars` directly is also safe
# when it is empty.
for (v in vars) {
  column <- df[[v]]
  if (is.factor(column)) {
    catvars <- c(catvars, v)
  } else if (is.logical(column)) {
    logivars <- c(logivars, v)
  } else {
    contvars <- c(contvars, v)
  }
}
contvars

## [1] "age"  "size"

catvars

## [1] "sex"     "smoker"  "labtest"

logivars

## [1] "logitest"

 logivars \n

# Subgroup with outcome == 0; which() discards rows whose outcome is NA.
bg <- df[which(df[, dx] == 0), ]
nrow(bg)  #; bg

Create the subgroups

# Subgroup with outcome == 1; which() discards rows whose outcome is NA.
mg <- df[which(df[, dx] == 1), ]
nrow(mg)  #; mg

## [1] 23

# Rows whose outcome is missing, i.e. patients that belong to neither subgroup.
indet <- df[which(is.na(df[, dx])), ]
nrow(indet)

## [1] 4

indet

##    age sex smoker   size logitest labtest outcome
## 1   70   m former 39.173       NA     med      NA
## 9   87   m former 23.621    FALSE      lo      NA
## 18  65   m former  2.466    FALSE    <NA>      NA
## 67  88   f former 17.575    FALSE     med      NA

 indet \n

# Shapiro-Wilk normality test for every continuous variable, run on all
# patients (both outcome groups pooled). Yields one p-value per entry of
# contvars, in the same order.
# Columns are selected by name: a position found in `vars` only matches the
# position in `df` when the outcome column comes last. vapply() also returns
# an empty numeric vector when there are no continuous variables, where the
# 1:length() loop would have failed.
normality <- vapply(
  contvars,
  function(v) shapiro.test(df[[v]])$p.value,
  numeric(1),
  USE.NAMES = FALSE
)
normality

For continuous variables

normality testing

# Bundle the per-variable results for the continuous variables.
# NOTE(review): ttpvalue is not assigned anywhere in the code visible here;
# it presumably holds one subgroup-comparison p-value per entry of contvars --
# confirm it is computed before this line runs.
contvarlist <- list(
  variables = contvars,
  normality = normality,
  ttest.by.subgroup = ttpvalue
)

 ## [1] 0.00125 0.73602

Comparing the means of two samples: if the data are normally distributed, use a t-test; otherwise use a Wilcoxon test.

# Chi-squared test of independence between each logical variable and the
# outcome; yields one p-value per entry of logivars, in the same order.
# summary() of a contingency table carries the test result, and table()
# leaves out NA values by default, so incomplete rows are skipped for that
# test only.
# Columns are selected by name: a position found in `vars` only matches the
# position in `df` when the outcome column comes last.
proppvalue <- vapply(
  logivars,
  function(v) summary(table(df[[v]], df[[dx]]))$p.value,
  numeric(1),
  USE.NAMES = FALSE
)
proppvalue

## [1] 0.5551

# Bundle the per-variable results for the logical variables.
logivarlist <- list(
  variables = logivars,
  chisq.by.subgroup = proppvalue
)

For categorical variables

# Reading the result: a Shapiro p < .05 suggests the variable is NOT normally
# distributed; a t-test p < .05 suggests the subgroups have different means.
str(contvarlist)  # if shapiro p<.05 then population likely NOT normally distributed; if t-test p<.05 then populations likely have different means

## List of 3
##  $ variables        : chr [1:2] "age" "size"
##  $ normality        : num [1:2] 0.00125 0.73602
##  $ ttest.by.subgroup: num [1:2] 0.636 0.367

# NOTE(review): catvarlist is not assigned anywhere in the code visible here;
# presumably it is built from catvars the same way logivarlist is built from
# logivars -- confirm it exists before this line runs.
str(catvarlist)  # if chisq p<.05 then the variable and the outcome are likely NOT independent

For logic variables

# Show the chi-squared p-value obtained for each logical variable.
str(logivarlist)  # if chisq p<.05 then the variable and the outcome are likely NOT independent

## List of 2
##  $ variables        : chr "logitest"
##  $ chisq.by.subgroup: num 0.555

 logivarlist = list(variables = logivars, chisq.by.subgroup = proppvalue) \n

And now, the results!

 str(contvarlist) #if shapiro p<.05 then pop likely NOT normally dist;  if t-test p<.05 then pop likely have different means \n

## List of 3
##  $ variables        : chr [1:2] "age" "size"
##  $ normality        : num [1:2] 0.00125 0.73602
##  $ ttest.by.subgroup: num [1:2] 0.636 0.367

 str(catvarlist) #if chisq p<.05 then variables are likely NOT independent \n

## List of 2
##  $ variables        : chr [1:3] "sex" "smoker" "labtest"
##  $ chisq.by.subgroup: num [1:3] 0.0158 0.7712 0.3948

 str(logivarlist) #if chisq p<.05 then variables are likely NOT independent \n

## List of 2
##  $ variables        : chr "logitest"
##  $ chisq.by.subgroup: num 0.555

multirow contingency table in R

Question

1 answers

solution1
0 2013-08-14 02:58:15

Generate data set

Change this as needed: the column with the diagnosis has to be removed from the variables list!

Build list of vars

Create the subgroups

For continuous variables

For categorical variables

For logic variables

And now, the results!

multirow contingency table in R

Question

1 answers

solution1 0 2013-08-14 02:58:15

Generate data set

Change this as needed: the column with the diagnosis has to be removed from the variables list!

Build list of vars

Create the subgroups

For continuous variables

For categorical variables

For logic variables

And now, the results!

solution1
0 2013-08-14 02:58:15