简体   繁体   中英

How to create design matrix in r

I have two factors: factor A has 2 levels and factor B has 3 levels.

How to create the following design matrix?

     factorA1 factorA2 factorB1 factorB2 factorB3
[1,]        1        0        1        0        0
[2,]        1        0        0        1        0
[3,]        1        0        0        0        1
[4,]        0        1        1        0        0
[5,]        0        1        0        1        0
[6,]        0        1        0        0        1

You have a couple of options:

Use base and piece it together yourself:

# One indicator column per Species level; "- 1" removes the intercept.
iris.dummy <- model.matrix(~ Species - 1, data = iris)
print(iris.dummy)
# Append the indicator columns to the original data and show the result.
IRIS <- data.frame(iris, iris.dummy)
print(IRIS)

Or use the ade4 package as follows:

#' Expand the factor columns of a data frame into indicator columns.
#'
#' Factor columns are passed to ade4::acm.disjonctif(); every other column
#' is kept as-is and placed first in the result.
#'
#' @param df A data frame.
#' @return A data frame holding the non-factor columns of `df` followed by
#'   the columns produced by ade4::acm.disjonctif(). When `df` has no factor
#'   columns, only the non-factor columns are returned.
dummy <- function(df) {
    # vapply() always yields a logical vector, even for a zero-column df.
    ISFACT <- vapply(df, is.factor, logical(1))
    NONFACTS <- df[, !ISFACT, drop = FALSE]
    if (!any(ISFACT)) {
        # Nothing to encode, so acm.disjonctif() is never called on an
        # empty column selection.
        return(data.frame(NONFACTS))
    }
    # require() merely returns FALSE when the package is missing; fail
    # loudly instead of erroring later with "could not find function".
    if (!requireNamespace("ade4", quietly = TRUE)) {
        stop("Package 'ade4' is required by dummy()", call. = FALSE)
    }
    FACTS <- ade4::acm.disjonctif(df[, ISFACT, drop = FALSE])
    data.frame(NONFACTS, FACTS)
}

# Example input: two categorical columns and one numeric column.
# stringsAsFactors = TRUE is required on R >= 4.0.0, where data.frame() no
# longer converts character vectors to factors; without it dummy() would
# find no factor columns to expand.
dat <- data.frame(
    eggs = c("foo", "foo", "bar", "bar"),
    ham = c("red", "blue", "green", "red"),
    x = rnorm(4),
    stringsAsFactors = TRUE
)
dummy(dat)



##           x eggs.bar eggs.foo ham.blue ham.green ham.red
## 1 0.3365302        0        1        0         0       1
## 2 1.1341354        0        1        1         0       0
## 3 2.0489741        1        0        0         1       0
## 4 1.1019108        1        0        0         0       1

Assuming your data is in a data.frame called dat, let's say the two factors are given as in this example:

> dat <- data.frame(f1=sample(LETTERS[1:3],20,T),f2=sample(LETTERS[4:5],20,T),id=1:20)
> dat
   f1 f2 id
1   C  D  1
2   B  E  2
3   B  E  3
4   A  D  4
5   C  E  5
6   C  E  6
7   C  D  7
8   B  E  8
9   C  D  9
10  A  D 10
11  B  E 11
12  C  E 12
13  B  D 13
14  B  E 14
15  A  D 15
16  C  E 16
17  C  D 17
18  C  D 18
19  B  D 19
20  C  D 20
> dat$f1
 [1] C B B A C C C B C A B C B B A C C C B C
Levels: A B C
> dat$f2
 [1] D E E D E E D E D D E E D E D E D D D D
Levels: D E

You can use outer to get a matrix as you showed, for each factor:

> F1 <- with(dat, outer(f1, levels(f1), `==`)*1)
> colnames(F1) <- paste("f1",sep="=",levels(dat$f1))
> F1
      f1=A f1=B f1=C
 [1,]    0    0    1
 [2,]    0    1    0
 [3,]    0    1    0
 [4,]    1    0    0
 [5,]    0    0    1
 [6,]    0    0    1
 [7,]    0    0    1
 [8,]    0    1    0
 [9,]    0    0    1
[10,]    1    0    0
[11,]    0    1    0
[12,]    0    0    1
[13,]    0    1    0
[14,]    0    1    0
[15,]    1    0    0
[16,]    0    0    1
[17,]    0    0    1
[18,]    0    0    1
[19,]    0    1    0
[20,]    0    0    1

Now do the same for the second factor:

> F2 <- with(dat, outer(f2, levels(f2), `==`)*1)
> colnames(F2) <- paste("f2",sep="=",levels(dat$f2))

And cbind them to get the final result:

> cbind(F1,F2)

model.matrix is the function that lm and others use in the background to do this conversion for you.

# Example data: two categorical columns plus an id column.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
dat <- data.frame(
    f1 = sample(LETTERS[1:3], 20, replace = TRUE),
    f2 = sample(LETTERS[4:5], 20, replace = TRUE),
    id = 1:20
)
dat

# Supplying the data frame through `data` keeps "dat$" out of the column names.
model.matrix(~ f1 + f2, data = dat)

It creates the intercept as a column of 1's, but you can easily remove that column if you don't need it.

# Drop the first column (the intercept). drop = FALSE keeps the result a
# matrix even if only one column is left; `data = dat` keeps "dat$" out of
# the column names.
model.matrix(~ f1 + f2, data = dat)[, -1, drop = FALSE]

Edit: Now I see that this is essentially the same as one of the other comments, but more concise.

Expanding and generalizing @Ferdinand.kraft's answer:

dat <- data.frame(
    f1 = sample(LETTERS[1:3], 20, TRUE),
    f2 = sample(LETTERS[4:5], 20, TRUE),
    row.names = paste0("id_", 1:20))

covariates <- c("f1", "f2") # in case you have other columns that you don't want to include in the design matrix
# Build one 0/1 indicator block per covariate and bind the blocks column-wise.
design <- do.call(cbind, lapply(covariates, function(covariate) {
    values <- dat[[covariate]]
    lvls <- unique(values)
    # `+ 0L` converts the logical matrix to integer while keeping its
    # dimensions, which also holds for a single-row data frame.
    block <- outer(values, lvls, FUN = "==") + 0L
    # Prefixing with the covariate name keeps column names unique when two
    # covariates share a level, so no column ever has to be dropped.
    colnames(block) <- paste(covariate, lvls, sep = ".")
    block
}))
rownames(design) <- rownames(dat)
design
# (first rows of one random draw)
#      f1.C f1.A f1.B f2.D f2.E
# id_1    1    0    0    1    0
# id_2    0    1    0    1    0
# id_3    0    0    1    1    0
# id_4    1    0    0    1    0
# id_5    0    1    0    1    0
# id_6    0    1    0    0    1
# id_7    0    0    1    0    1

model.matrix() only allows what it calls "dummy" coding for the first factor in a formula. If the intercept is present, it plays that role. To get the desired effect of a redundant index matrix (where you have a 1 in every column for the corresponding factor level and 0 elsewhere), you can lie to model.matrix() and pretend there's an extra level. Then trim off the intercept column.


> a=rep(1:2,3)
> b=rep(1:3,2)
> df=data.frame(A=a,B=b)
> # Lie and pretend there's a level 0 in each factor.
> df$A=factor(a,as.character(0:2))

> df$B=factor(b,as.character(0:3))

> mm=model.matrix (~A+B,df)

> mm
  (Intercept) A1 A2 B1 B2 B3
1           1  1  0  1  0  0
2           1  0  1  0  1  0
3           1  1  0  0  0  1
4           1  0  1  1  0  0
5           1  1  0  0  1  0
6           1  0  1  0  0  1
attr(,"assign")
[1] 0 1 1 2 2 2
attr(,"contrasts")
attr(,"contrasts")$A
[1] "contr.treatment"
attr(,"contrasts")$B
[1] "contr.treatment"

> # mm has an intercept column not requested, so kill it
> dm=as.matrix(mm[,-1])
> dm
  A1 A2 B1 B2 B3
1  1  0  1  0  0
2  0  1  0  1  0
3  1  0  0  0  1
4  0  1  1  0  0
5  1  0  0  1  0
6  0  1  0  0  1

> # You can also add interactions
> mm2=model.matrix (~A*B,df)
> dm2=as.matrix(mm2[,-1])
> dm2
  A1 A2 B1 B2 B3 A1:B1 A2:B1 A1:B2 A2:B2 A1:B3 A2:B3
1  1  0  1  0  0     1     0     0     0     0     0
2  0  1  0  1  0     0     0     0     1     0     0
3  1  0  0  0  1     0     0     0     0     1     0
4  0  1  1  0  0     0     1     0     0     0     0
5  1  0  0  1  0     0     0     1     0     0     0
6  0  1  0  0  1     0     0     0     0     0     1

Things get complicated with model.matrix() again if we add a covariate x and interactions of x with factors.

# Six observations: a two-level factor A, a three-level factor B and a
# numeric covariate x.
a <- rep(1:2, times = 3)
b <- rep(1:3, times = 2)
x <- 1:6
# Lie and pretend there's a level 0 in each factor.
df <- data.frame(
    A = factor(a, levels = as.character(0:2)),
    B = factor(b, levels = as.character(0:3)),
    x = x
)

# Main effects plus the factor-by-covariate interactions.
mm <- model.matrix(~ A + B + A:x + B:x, data = df)

print(mm)

  (Intercept) A1 A2 B1 B2 B3 A0:x A1:x A2:x B1:x B2:x B3:x
1           1  1  0  1  0  0    0    1    0    1    0    0
2           1  0  1  0  1  0    0    0    2    0    2    0
3           1  1  0  0  0  1    0    3    0    0    0    3
4           1  0  1  1  0  0    0    0    4    4    0    0
5           1  1  0  0  1  0    0    5    0    0    5    0
6           1  0  1  0  0  1    0    0    6    0    0    6

So mm has an intercept, but now the A:x interaction terms include an unwanted level, A0:x. If we reintroduce x as a separate term, that unwanted level is cancelled.

# Same model with x added as its own leading term.
mm2 <- model.matrix(~ x + A + B + A:x + B:x, data = df)
print(mm2)
  (Intercept) x A1 A2 B1 B2 B3 x:A1 x:A2 x:B1 x:B2 x:B3
1           1 1  1  0  1  0  0    1    0    1    0    0
2           1 2  0  1  0  1  0    0    2    0    2    0
3           1 3  1  0  0  0  1    3    0    0    0    3
4           1 4  0  1  1  0  0    0    4    4    0    0
5           1 5  1  0  0  1  0    5    0    0    5    0
6           1 6  0  1  0  0  1    0    6    0    0    6

We can get rid of the unwanted intercept and the unwanted bare x term

# Drop the first two columns of mm2: the intercept and the bare x term.
dm2 <- as.matrix(mm2[, -(1:2)])
print(dm2)

  A1 A2 B1 B2 B3 x:A1 x:A2 x:B1 x:B2 x:B3
1  1  0  1  0  0    1    0    1    0    0
2  0  1  0  1  0    0    2    0    2    0
3  1  0  0  0  1    3    0    0    0    3
4  0  1  1  0  0    0    4    4    0    0
5  1  0  0  1  0    5    0    0    5    0
6  0  1  0  0  1    0    6    0    0    6

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM