简体   繁体   中英

Reshape a data frame efficiently with dplyr in R

I have a dataframe like this.

# Example data: five ids, each paired with a comma-separated string of items.
id <- letters[1:5]
items <- c(
  'A,B,C,D,E',
  'C,D,E,A,B',
  'E,D,C',
  'B,A',
  'A'
)
dat <- tibble(id = id, items = items)

> dat
# A tibble: 5 x 2
  id    items    
  <chr> <chr>    
1 a     A,B,C,D,E
2 b     C,D,E,A,B
3 c     E,D,C    
4 d     B,A      
5 e     A     

I want to split items into its individual values and replace that column with one indicator variable per value (variables A to E).

The format I want the data to end up in looks like this:

# Desired layout: one row per id and one 0/1 indicator column per item label.
final.dat <- tibble(
  id = id,
  A = c(1, 1, 0, 1, 1),
  B = c(1, 1, 0, 1, 0),
  C = c(1, 1, 1, 0, 0),
  D = c(1, 1, 1, 0, 0),
  E = c(1, 1, 1, 0, 0)
)

> final.dat
# A tibble: 5 x 6
  id        A     B     C     D     E
  <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a         1     1     1     1     1
2 b         1     1     1     1     1
3 c         0     0     1     1     1
4 d         1     1     0     0     0
5 e         1     0     0     0     0

This is my code, but I think it is redundant.

There is also a bug in my code: when I replace map(as_tibble) with map(as.data.frame), all the values are NA.

Is there a more efficient way to do this?

Any help will be highly appreciated!

# First column of dat (the ids), kept as a one-column tibble.
id <- dat[, 1]

# Second column of dat (the comma-separated item strings), to be reshaped.
items <- dat[, 2]

# Turn a one-row data frame of item labels into a one-row data frame of 1s
# whose column names are those labels.
#
# data: a data frame (or tibble) whose first row holds the item labels, one
#       label per column. Only the first row is used.
# Returns: a one-row object of the same class as `data`, with one numeric
#          column per label, each holding 1.
#
# The columns are rebuilt as numeric rather than appending a row of 1s with
# rbind(): appending coerces the 1 to each column's existing type, which gives
# the string "1" for character columns and NA for factor columns (1 is not a
# valid factor level).
make.title <- function(data) {
  first <- data[1, , drop = FALSE]
  # as.character() yields the label text for factor columns, not integer codes.
  labels <- vapply(first, as.character, character(1), USE.NAMES = FALSE)
  for (j in seq_along(first)) {
    # [[<- replaces the whole column, so its previous type does not matter.
    first[[j]] <- 1
  }
  names(first) <- labels
  first
}

# final.dat.2 is the result the asker wanted: one row per id with one column
# per item label. bind_cols() appends the id column after the item columns.
final.dat.2 <-  
  split(items, seq(nrow(items))) %>%  # list with one one-row piece of items per row
  map(unlist) %>%  # flatten each piece to a vector holding its item string
  map(~str_split(., pattern = ',')) %>%  # split on commas; str_split() returns a list
  map(unlist) %>%  # flatten to a character vector of item labels
  map(rbind) %>%  # rbind() of a single vector gives a one-row matrix
  map(as_tibble) %>%  # one-row tibble, one column per label
  map(make.title) %>%  # labels become the column names; the row holds the 1 flags
  bind_rows() %>%  # stack the rows; columns absent for an id are filled with NA
  transmute(across(.cols = everything(), ~replace_na(., 0))) %>%  # NA -> 0
  bind_cols(id)  # attach the id column

# Same pipeline as final.dat.2, except that each one-row matrix is converted
# with as.data.frame() instead of as_tibble(); the asker reports that every
# value then comes out as NA.
# NOTE(review): presumably as.data.frame() produced factor columns here (the
# default before R 4.0), and a numeric 1 written into a factor column becomes
# NA -- confirm against the R version in use.
final.dat.3 <-  
  split(items, seq(nrow(items))) %>%  # list with one one-row piece of items per row
  map(unlist) %>%  # flatten each piece to a vector holding its item string
  map(~str_split(., pattern = ',')) %>%  # split on commas; str_split() returns a list
  map(unlist) %>%  # flatten to a character vector of item labels
  map(rbind) %>%  # rbind() of a single vector gives a one-row matrix
  map(as.data.frame) %>%  # as dataframe
  map(make.title) %>%  # labels become the column names
  bind_rows() %>%  # stack the rows; columns absent for an id are filled with NA
  transmute(across(.cols = everything(), ~replace_na(., 0))) %>%  # NA -> 0
  bind_cols(id)  # attach the id column

Try this. You can use separate_rows() and pivot_wider() from tidyverse to reach the expected output:

library(dplyr)
library(tidyr)
# Put every item on its own row, flag it with 1, then spread the items into
# indicator columns; id/item pairs that never occur are filled with 0.
newdf <- dat %>%
  separate_rows(items, sep = ',') %>%
  mutate(Val = 1) %>%
  pivot_wider(
    names_from = items,
    values_from = Val,
    values_fill = 0
  )

Output:

# A tibble: 5 x 6
  id        A     B     C     D     E
  <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 a         1     1     1     1     1
2 b         1     1     1     1     1
3 c         0     0     1     1     1
4 d         1     1     0     0     0
5 e         1     0     0     0     0

We can also do

library(dplyr)
library(tidyr)
# Split each items string into a list-column of labels, unnest to one row per
# (id, label) pair, flag each pair with 1, then widen to one indicator column
# per label; id/label pairs that never occur are filled with 0.
dat %>%
  mutate(items = strsplit(items, ",")) %>%
  unnest(c(items)) %>%
  mutate(Val = 1) %>%
  pivot_wider(names_from = items, values_from = Val, values_fill = 0)

Or an option with mtabulate

library(qdapTools)
# Split every items string on commas, tabulate the labels of each id into one
# column per label, and bind the id column in front.
cbind(
  dat['id'],
  mtabulate(strsplit(dat[['items']], split = ","))
)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM