I have a data frame named “dat” with 10 numeric variables (var1, var2, var3, var4, var5, …, var10), each with several observations…
dat
var1 var2 var3 var4 var5 var6 var7 var8 var9 var10
1 12 5 18 19 12 17 11 16 18 10
2 3 2 10 6 13 17 11 16 18 10
3 13 15 14 13 1 17 11 16 18 10
4 17 11 16 18 10 17 11 16 18 10
5 9 13 8 8 7 17 11 16 18 10
6 15 6 20 17 3 17 11 16 18 10
7 12 5 18 19 12 17 11 16 18 10
8 3 2 10 6 13 17 11 16 18 10
9 13 15 14 13 1 17 11 16 18 10
...
I would like to write code that repeats the same function for all the variables (except the first) in a data frame. The function should fit a linear regression between var1 and each of the other variables (var2, var3, var4, var5, …), one at a time, using the lm() function
eg cycle 1: linear regression between var 1 and var 2
lm(var1~var2, data=dat)
cycle 2: linear regression between var 1 and var 3,
lm(var1~var3, data=dat)
cycle 3: linear regression between var 1 and var 4
lm(var1~var4, data=dat)
and so on…
I would also like the results from each cycle to be saved in a new data frame named “results”, with the following structure
Var_tested Correlation_coefficient P_value_correlation R_squared
Var2 corr_coeff_var2 p_value_var2 R_sq_var2
Var3 corr_coeff_var3 p_value_var3 R_sq_var3
Var4 corr_coeff_var4 p_value_var4 R_sq_var4
Each row should report the results of one correlation. Is it possible?
Thank you so much for your help!
You can try the following code to have the desired output
# Example data set: nine observations of ten integer variables.
# var6 to var10 hold a single repeated value each.
data <- data.frame(
  var1 = c(12L, 3L, 13L, 17L, 9L, 15L, 12L, 3L, 13L),
  var2 = c(5L, 2L, 15L, 11L, 13L, 6L, 5L, 2L, 15L),
  var3 = c(18L, 10L, 14L, 16L, 8L, 20L, 18L, 10L, 14L),
  var4 = c(19L, 6L, 13L, 18L, 8L, 17L, 19L, 6L, 13L),
  var5 = c(12L, 13L, 1L, 10L, 7L, 3L, 12L, 13L, 1L),
  var6 = rep(17L, 9L),
  var7 = rep(11L, 9L),
  var8 = rep(16L, 9L),
  var9 = rep(18L, 9L),
  var10 = rep(10L, 9L)
)
# Show the first two rows
head(data, n = 2)
#> var1 var2 var3 var4 var5 var6 var7 var8 var9 var10
#> 1 12 5 18 19 12 17 11 16 18 10
#> 2 3 2 10 6 13 17 11 16 18 10
# Names of every column except the response (the first column). The names
# vector is indexed instead of the data frame: `data[, -1]` drops to a bare
# vector when only one predictor column is left, and names() of that is NULL.
x <- names(data)[-1]
# One formula string per predictor: "var1 ~ var2", "var1 ~ var3", ...
# paste0() is vectorised over x, so no lapply()/combn() wrapper is needed.
out <- paste0("var1 ~ ", x)
out
#> [1] "var1 ~ var2" "var1 ~ var3" "var1 ~ var4" "var1 ~ var5"
#> [5] "var1 ~ var6" "var1 ~ var7" "var1 ~ var8" "var1 ~ var9"
#> [9] "var1 ~ var10"
library(broom)
#> Warning: package 'broom' was built under R version 3.5.3
library(dplyr)
#> Warning: package 'dplyr' was built under R version 3.5.3
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# Coefficient table of every model, one block of rows per formula.
# Each block is tagged with the formula string that produced it.
tmp1 <- bind_rows(lapply(out, function(model_formula) {
  coef_tbl <- tidy(lm(model_formula, data = data))
  coef_tbl[["frml"]] <- model_formula
  coef_tbl
}))
head(tmp1)
#> # A tibble: 6 x 6
#> term estimate std.error statistic p.value frml
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 (Intercept) 6.46 2.78 2.33 0.0529 var1 ~ var2
#> 2 var2 0.525 0.288 1.82 0.111 var1 ~ var2
#> 3 (Intercept) -1.50 4.47 -0.335 0.748 var1 ~ var3
#> 4 var3 0.863 0.303 2.85 0.0247 var1 ~ var3
#> 5 (Intercept) 0.649 2.60 0.250 0.810 var1 ~ var4
#> 6 var4 0.766 0.183 4.18 0.00413 var1 ~ var4
# Model-level summaries (R2, AIC, BIC, ...) of every model, one row per
# formula, tagged with the formula string that produced it.
tmp2 <- bind_rows(lapply(out, function(model_formula) {
  fit_summary <- glance(lm(model_formula, data = data))
  fit_summary[["frml"]] <- model_formula
  fit_summary
}))
head(tmp2)
#> # A tibble: 6 x 12
#> r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
#> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
#> 1 0.321 0.224 4.33 3.31 0.111 2 -24.8 55.7 56.3
#> 2 0.537 0.471 3.58 8.12 0.0247 2 -23.1 52.2 52.8
#> 3 0.714 0.673 2.81 17.5 0.00413 2 -20.9 47.9 48.5
#> 4 0.276 0.173 4.47 2.67 0.146 2 -25.1 56.2 56.8
#> 5 0 0 4.92 NA NA 1 -26.6 57.2 57.6
#> 6 0 0 4.92 NA NA 1 -26.6 57.2 57.6
#> # ... with 3 more variables: deviance <dbl>, df.residual <int>, frml <chr>
# Save both result tables as CSV files in the working directory
write.csv(x = tmp1, file = "Try_lm_coefficients.csv")
write.csv(x = tmp2, file = "Try_lm_results.csv")
Created on 2019-11-20 by the reprex package (v0.3.0)
# Example data set: nine observations of ten integer variables, with
# character row names "1" to "9". var6 to var10 hold a single repeated value.
dat <- data.frame(
  var1 = c(12L, 3L, 13L, 17L, 9L, 15L, 12L, 3L, 13L),
  var2 = c(5L, 2L, 15L, 11L, 13L, 6L, 5L, 2L, 15L),
  var3 = c(18L, 10L, 14L, 16L, 8L, 20L, 18L, 10L, 14L),
  var4 = c(19L, 6L, 13L, 18L, 8L, 17L, 19L, 6L, 13L),
  var5 = c(12L, 13L, 1L, 10L, 7L, 3L, 12L, 13L, 1L),
  var6 = rep(17L, 9L),
  var7 = rep(11L, 9L),
  var8 = rep(16L, 9L),
  var9 = rep(18L, 9L),
  var10 = rep(10L, 9L),
  row.names = as.character(1:9)
)
We first write a function to obtain all the statistics you need. Note that in a simple linear regression R-squared is the square of the Pearson correlation coefficient, so you don't need the linear model to get it. The coefficient you get from the model is the slope.
# Correlate one predictor column with a response column and collect the
# statistics of the pair in a one-row data frame.
#
# x:    name (character string) of the predictor column in DATA.
# y:    name (character string) of the response column in DATA.
# DATA: data frame containing both columns.
#
# Returns a data frame with the columns VAR (predictor name), PEARSON_COR,
# PVAL (p-value of the correlation test), RSQ (squared correlation) and
# SLOPE (slope of the regression of y on x). When the predictor is constant
# the statistics are NA instead of the call failing.
STATS <- function(x, y, DATA) {
  # `[[` always extracts a plain vector; `DATA[, x]` does not when DATA is
  # a tibble.
  predictor <- DATA[[x]]
  response <- DATA[[y]]

  COR <- cor.test(response, predictor)
  r <- as.numeric(COR$estimate)

  # coef() keeps an NA entry for a slope that lm() cannot estimate, whereas
  # the coefficient matrix of summary() drops that row, so indexing [2, 1]
  # there fails for a constant predictor.
  slope <- unname(coef(lm(response ~ predictor))[2])

  data.frame(
    VAR = x,
    PEARSON_COR = r,
    PVAL = COR$p.value,
    RSQ = r^2,
    SLOPE = slope,
    stringsAsFactors = FALSE
  )
}
We test it on var2
STATS("var2","var1",dat)
VAR PEARSON_COR PVAL RSQ SLOPE
1 var2 0.5668721 0.1114741 0.321344 0.5251232
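For example, we apply it to var2, var3 and var4 and combine the results into a data frame. Note that I did not try var6 to var10 because each of them contains only a single repeated value, so its correlation with var1 is undefined.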
# Compute the statistics of each tested variable against var1, then stack
# the one-row data frames into a single table.
per_variable <- lapply(
  c("var2", "var3", "var4"),
  function(tested) STATS(tested, "var1", dat)
)
results <- do.call(rbind, per_variable)
results
VAR PEARSON_COR PVAL RSQ SLOPE
1 var2 0.5668721 0.111474101 0.3213440 0.5251232
2 var3 0.7328421 0.024699805 0.5370575 0.8630573
3 var4 0.8450726 0.004127542 0.7141477 0.7660377
If you are familiar with tidyverse and purrr, you can do the following:
library(dplyr)
library(purrr)
# Call STATS(<variable>, "var1", dat) for each variable and row-bind the results
map_dfr(c("var2", "var3", "var4"), STATS, "var1", dat)
There are several ways to do what you want in R. I suggest sapply,
which is a simple way to apply a function over a list of variables. Here is an example to get the coefficients of each linear regression between var1 and all other variables.
# Fit the simple linear regression var1 ~ <var> and return its coefficients.
#
# var:  name (character string) of the predictor column.
# data: data frame holding `var1` and the predictor column; defaults to the
#       global `dat`, so existing calls of the form do_lm("var2") still work.
#
# Returns a named numeric vector c(intercept = ..., slope = ...). The slope
# is NA when lm() cannot estimate it (e.g. a constant predictor).
do_lm <- function(var, data = dat) {
  res <- lm(as.formula(paste0("var1~", var)), data = data) # compute linear regression
  # coef() returns the intercept first and the slope second. unname() stops
  # lm()'s own coefficient names from being fused into the labels below.
  estimates <- unname(coef(res))
  c(intercept = estimates[1], slope = estimates[2])
}
# Run do_lm on every column name except the first one (var1, the response).
# `[-1]` adapts to the number of columns instead of hard-coding 2:10.
t(
  vapply(colnames(dat)[-1], do_lm, numeric(2))
)
# t transposes the result, giving one row per tested variable
# vapply : applies the function do_lm on "var2" ... "var10" and checks that
#          every call returns a numeric vector of length 2
It returns:
intercept.var2 slope.(Intercept)
var2 0.5251232 6.4600985
var3 0.8630573 -1.4968153
var4 0.7660377 0.6490566
var5 -0.5047619 14.8158730
var6 NA 10.7777778
var7 NA 10.7777778
var8 NA 10.7777778
var9 NA 10.7777778
var10 NA 10.7777778
You can adapt the function do_lm used in sapply to compute other things, such as correlations.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.