Let's consider this data set:
# Toy data set: 20 patients with a mix of numeric, character and factor
# columns plus a binary outcome.
df <- data.frame(
  age = sample(20:90, 20, replace = TRUE),
  sex = sample(c("m", "f"), 20, replace = TRUE),
  smoker = sample(c("never", "former", "active"), 20, replace = TRUE),
  size = sample(8:40, 20, replace = TRUE),
  fac = as.factor(sample(c("neg", "lo", "med", "hi"), 20, replace = TRUE)),
  outcome = sample(c(0, 1), 20, replace = TRUE)
)
# let's introduce some missing data
# Each pass blanks one randomly chosen cell; the same cell can be hit twice,
# so between 1 and 3 cells end up NA. The bounds come from the data frame
# itself instead of the hard-coded 20 rows / 6 columns.
for (i in 1:3) {
  df[sample.int(nrow(df), 1), sample.int(ncol(df), 1)] <- NA
}
In a medical manuscript the first table summarizes the population (or its subgroups, as appropriate); here the rows would be age, sex, smoking status, etc., and the two outcomes would be listed in separate columns. The continuous variables are reported as means; the categorical variables as counts.
Asking too much?!
In medical articles, 'Table 1' summarizes the demographics of the study population, usually broken down by subgroup.
# Simulate a study population of n patients with numeric, categorical and
# logical covariates plus a binary outcome.
n <- 100
df <- data.frame(
  age = sample(20:90, n, replace = TRUE),
  # Draw n values: drawing only 20 made data.frame() silently recycle the
  # same 20 values to fill the column.
  sex = sample(c("m", "f"), n, replace = TRUE, prob = c(0.55, 0.45)),
  smoker = sample(
    c("never", "former", "active"), n,
    replace = TRUE, prob = c(0.4, 0.45, 0.15)
  ),
  size = abs(rnorm(n, 20, 8)),
  logitest = sample(c(TRUE, FALSE), n, replace = TRUE, prob = c(0.1, 0.9)),
  labtest = as.factor(sample(
    c("neg", "lo", "med", "hi"), n,
    replace = TRUE, prob = c(0.4, 0.3, 0.2, 0.1)
  )),
  outcome = sample(c(0, 1), n, replace = TRUE, prob = c(0.8, 0.2)),
  # Since R 4.0.0 strings are no longer converted to factors by default;
  # request it explicitly so sex and smoker are factors.
  stringsAsFactors = TRUE
)

# let's introduce some missing data
# Each pass blanks one randomly chosen cell (the same cell may be hit twice).
for (i in seq_len(floor(n / 6))) {
  df[sample.int(nrow(df), 1), sample.int(ncol(df), 1)] <- NA
}
head(df)
## age sex smoker size logitest labtest outcome
## 1 70 m former 39.17 NA med NA
## 2 51 f former 33.64 FALSE hi 1
## 3 58 f former 10.10 FALSE neg 1
## 4 30 m former 43.24 FALSE med 0
## 5 54 m former 22.78 FALSE lo 0
## 6 86 f former 8.20 FALSE neg 0
If you are working with a real data set, use it instead.
dx <- 7 # index of outcome/diagnosis
####################################
# Summarise every column except the outcome.
summary(df[, -dx])
# NOTE(review): attach() puts the columns of df on the search path. None of the
# code visible here refers to a column by bare name, so this looks unnecessary
# -- confirm nothing else relies on it before removing.
attach(df)
## age sex smoker size logitest ## Min. :20.0 f :44 active:19 Min. : 0.91 Mode :logical ## 1st Qu.:42.5 m :54 former:49 1st Qu.:15.00 FALSE:85 ## Median :58.0 NA's: 2 never :30 Median :20.12 TRUE :12 ## Mean :57.3 NA's : 2 Mean :20.44 NA's :3 ## 3rd Qu.:74.0 3rd Qu.:27.10 ## Max. :88.0 Max. :43.24 ## NA's :1 NA's :2 ## labtest ## hi : 4 ## lo :29 ## med :20 ## neg :45 ## NA's: 2 ## ##
# Start from every column name, then drop the outcome column so that only the
# candidate covariates remain.
vars <- colnames(df)
vars
catvars <- NULL # categorical variables
contvars <- NULL # continuous variables
logivars <- NULL # logic variables
vars <- vars[-dx]
vars
## [1] "age" "sex" "smoker" "size" "logitest" "labtest"
# Sort every candidate variable into one of three groups by its column type.
# Columns are looked up by name, so the result does not depend on the outcome
# column being the last one in df.
var_kind <- vapply(vars, function(v) {
  column <- df[[v]]
  if (is.factor(column) || is.character(column)) {
    # character columns are categorical too (strings are not auto-converted
    # to factors by data.frame() in R >= 4.0)
    "categorical"
  } else if (is.logical(column)) {
    "logical"
  } else {
    "continuous"
  }
}, character(1), USE.NAMES = FALSE)

catvars <- vars[var_kind == "categorical"]
logivars <- vars[var_kind == "logical"]
contvars <- vars[var_kind == "continuous"]

contvars
## [1] "age" "size"
catvars
## [1] "sex" "smoker" "labtest"
logivars
## [1] "logitest"
# Split the patients by outcome. which() drops rows whose outcome is NA, so
# those rows land in neither bg nor mg and are collected separately in indet.
bg <- df[which(df[[dx]] == 0), ]
nrow(bg) # ; bg
mg <- df[which(df[[dx]] == 1), ]
nrow(mg) # ; mg
## [1] 23
indet <- df[is.na(df[[dx]]), ]
nrow(indet)
## [1] 4
indet
## age sex smoker size logitest labtest outcome
## 1 70 m former 39.173 NA med NA
## 9 87 m former 23.621 FALSE lo NA
## 18 65 m former 2.466 FALSE <NA> NA
## 67 88 f former 17.575 FALSE med NA
# Shapiro-Wilk normality test for every continuous variable, run on all
# patients (bg and mg alike). Columns are looked up by name, and an empty
# contvars yields an empty result instead of failing on 1:0.
normality <- vapply(
  contvars,
  function(v) shapiro.test(df[[v]])$p.value,
  numeric(1),
  USE.NAMES = FALSE
)
normality
## [1] 0.00125 0.73602
# NOTE(review): ttpvalue is not defined anywhere in the code visible here --
# presumably the per-variable p-values comparing the two outcome subgroups;
# confirm it is computed before this line runs.
contvarlist <- list(
  variables = contvars,
  normality = normality,
  ttest.by.subgroup = ttpvalue
)
Comparing the means of two samples:
if the data are normally distributed, use a t-test; otherwise use a Wilcoxon test.
# Chi-squared test of independence between each logical variable and the
# outcome; summary() of a contingency table carries the test's p-value.
# Columns are looked up by name, and an empty logivars yields an empty result
# instead of failing on 1:0.
proppvalue <- vapply(
  logivars,
  function(v) {
    tbl <- table(df[[v]], df[[dx]])
    summary(tbl)$p.value
  },
  numeric(1),
  USE.NAMES = FALSE
)
proppvalue
## [1] 0.5551
logivarlist <- list(variables = logivars, chisq.by.subgroup = proppvalue)
# Print the structure of each result list.
# Reading the continuous-variable list: a Shapiro p-value below .05 suggests
# the population is likely NOT normally distributed; a t-test p-value below
# .05 suggests the subgroups likely have different means.
str(contvarlist)
## List of 3
## $ variables : chr [1:2] "age" "size"
## $ normality : num [1:2] 0.00125 0.73602
## $ ttest.by.subgroup: num [1:2] 0.636 0.367
# Reading the chi-squared lists: a p-value below .05 suggests the variable and
# the outcome are likely NOT independent.
# NOTE(review): catvarlist is not created anywhere in the code visible here --
# confirm it is built before this line runs.
str(catvarlist)
str(logivarlist)
## List of 2
## $ variables : chr "logitest"
## $ chisq.by.subgroup: num 0.555
# Bundle the logical variables with their chi-squared p-values, then print the
# structure of each result list.
logivarlist <- list(variables = logivars, chisq.by.subgroup = proppvalue)
# If the Shapiro p-value is below .05 the population is likely NOT normally
# distributed; if the t-test p-value is below .05 the subgroups likely have
# different means.
str(contvarlist)
## List of 3
## $ variables : chr [1:2] "age" "size"
## $ normality : num [1:2] 0.00125 0.73602
## $ ttest.by.subgroup: num [1:2] 0.636 0.367
# If a chi-squared p-value is below .05 the variable and the outcome are
# likely NOT independent.
# NOTE(review): catvarlist is not created anywhere in the code visible here --
# confirm it is built before this line runs.
str(catvarlist)
## List of 2
## $ variables : chr [1:3] "sex" "smoker" "labtest"
## $ chisq.by.subgroup: num [1:3] 0.0158 0.7712 0.3948
str(logivarlist)
## List of 2
## $ variables : chr "logitest"
## $ chisq.by.subgroup: num 0.555
The technical posts on this site are published under the CC BY-SA 4.0 license. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.