Suppose I have a data set like this:
dat <- tibble(id = 1:4,
col1 = c(0, 1, 1, 0),
col2 = c(1, 0, 1, 0),
col3 = c(1, 1, 0, 1))
> dat
# A tibble: 4 × 4
id col1 col2 col3
<int> <dbl> <dbl> <dbl>
1 1 0 1 1
2 2 1 0 1
3 3 1 1 0
4 4 0 0 1
I'd like to separate, for every unique id, the multiple 1s into multiple rows, ie the expected output is:
# A tibble: 7 × 4
id col1 col2 col3
<dbl> <dbl> <dbl> <dbl>
1 1 0 1 0
2 1 0 0 1
3 2 1 0 0
4 2 0 0 1
5 3 1 0 0
6 3 0 1 0
7 4 0 0 1
For the first id (id = 1), col2 and col3 are both 1, so I would like a separate row for each of them. It kinda is like one-hot encoding for rows.
With help from Ritchie Sacramento and RobertoT
library(tidyverse)
dat <- tibble(id = 1:4,
col1 = c(0, 1, 1, 0),
col2 = c(1, 0, 1, 0),
col3 = c(1, 1, 0, 1))
dat %>%
pivot_longer(-id) %>%
filter(value != 0) %>%
mutate(rows = 1:nrow(.)) %>%
pivot_wider(values_fill = 0,
names_sort = TRUE) %>%
select(-rows)
# A tibble: 7 × 4
id col1 col2 col3
<int> <dbl> <dbl> <dbl>
1 1 0 1 0
2 1 0 0 1
3 2 1 0 0
4 2 0 0 1
5 3 1 0 0
6 3 0 1 0
7 4 0 0 1
Here is an alternative approach using model.matrix()
:
From the documenation: model.matrix
creates a design (or model) matrix, eg, by expanding factors to a set of dummy variables (depending on the contrasts) and expanding interactions similarly.
library(dplyr)
library(tidyr)
dat %>%
pivot_longer(-id) %>%
filter(value == 1) %>%
cbind((model.matrix(~ name + 0, .) == 1)*1)
id name value namecol1 namecol2 namecol3
1 1 col2 1 0 1 0
2 1 col3 1 0 0 1
3 2 col1 1 1 0 0
4 2 col3 1 0 0 1
5 3 col1 1 1 0 0
6 3 col2 1 0 1 0
7 4 col3 1 0 0 1
You could do
arrange(bind_rows(lapply(2:4, function(x) {
d <- dat[dat[[x]] == 1,]
d[-c(1, x)] <- 0
d})), id)
#> # A tibble: 7 x 4
#> id col1 col2 col3
#> <int> <dbl> <dbl> <dbl>
#> 1 1 0 1 0
#> 2 1 0 0 1
#> 3 2 1 0 0
#> 4 2 0 0 1
#> 5 3 1 0 0
#> 6 3 0 1 0
#> 7 4 0 0 1
Created on 2022-07-14 by the reprex package (v2.0.1)
Using explicit loops:
nullrow <- rep(0, ncol(dat)-1)
data <- dat[,-1]
rowsums <- apply(data, 1, sum)
res <- data[0,]
ids <- c()
for(i in 1:nrow(data)) {
if(rowsums[i]>0) {
for(j in 1:rowsums[i]) {
thisrow <- nullrow
thiscolumn <- which(data[i,]==1)[j]
thisrow[thiscolumn] <- 1
res <- rbind(res, thisrow)
}
ids <- c(ids, rep(dat$id[i], rowsums[i]))
}
}
names(res) <- colnames(data)
res$id <- ids
> res
col1 col2 col3 id
1 0 1 0 1
2 0 0 1 1
3 1 0 0 2
4 0 0 1 2
5 1 0 0 3
6 0 1 0 3
7 0 0 1 4
A possible solution, based on purrr:pmap_dfr
and on the following ideas:
Loop over all dataframe rows.
Use each row to create a diagonal matrix with the contents of the diagonal being the dataframe row.
Filter out the rows that only have zeros.
library(tidyverse)
pmap_dfr(dat, ~ data.frame(id = ..1, diag(c(...)[-1]))) %>%
filter(if_any(X1:X3, ~ .x != 0))
#> id X1 X2 X3
#> 1 1 0 1 0
#> 2 1 0 0 1
#> 3 2 1 0 0
#> 4 2 0 0 1
#> 5 3 1 0 0
#> 6 3 0 1 0
#> 7 4 0 0 1
Another possible solution, based on Matrix::sparseMatrix
:
which
).library(tidyverse)
library(Matrix)
which(dat[-1] == 1, arr.ind = T) %>%
as.data.frame %>%
arrange(row) %>%
mutate(id = dat[row,"id"], row = 1:n()) %>%
{data.frame(id = .$id, as.matrix( sparseMatrix(i = .$row, j= .$col, x= 1)))}
#> id X1 X2 X3
#> 1 1 0 1 0
#> 2 1 0 0 1
#> 3 2 1 0 0
#> 4 2 0 0 1
#> 5 3 1 0 0
#> 6 3 0 1 0
#> 7 4 0 0 1
Yet another possible solution:
library(tidyverse)
f <- function(df)
{
got <- 0
for (i in 1:nrow(df))
{
got <- which.max(df[i, (got+1):ncol(df)]) + got
df[i, -got] <- 0
}
df
}
dat %>%
slice(map(1:nrow(dat), ~ rep(.x, rowSums(dat[-1])[.x])) %>% unlist) %>%
group_by(id) %>%
group_modify(~ f(.)) %>%
ungroup
#> # A tibble: 7 × 4
#> id col1 col2 col3
#> <int> <dbl> <dbl> <dbl>
#> 1 1 0 1 0
#> 2 1 0 0 1
#> 3 2 1 0 0
#> 4 2 0 0 1
#> 5 3 1 0 0
#> 6 3 0 1 0
#> 7 4 0 0 1
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.