简体   繁体   中英

R: Reorganizing rows of each dataframe within a list into a new list of new dataframes

Edited to add more details and clarify. Basically, I have a list of data frames, they have the same row numbers but various column numbers, so the dimension of each data frame is different . What I want to do now is to select the first row of each data frame, put them into a new data frame and use it as the first element of a new list, then do the same things for second rows, third rows...

I have contemplated to use 2 for loops to reassign rows, however that seems to be a very bad way to go about it given that nested for loop is pretty slow and the data I have is huge. Would truly appreciate sound insight and help.

myList <- list()
df1 <- as.data.frame(matrix(1:6, nrow=3, ncol=2))
df2 <- as.data.frame(matrix(7:15, nrow=3, ncol=3))
myList[[1]]<-df1
myList[[2]]<-df2
print(myList)

Current example data -

> print(myList)
[[1]]
  V1 V2
1  1  4
2  2  5
3  3  6

[[2]]
  V1 V2 V3
1  7 10 13
2  8 11 14
3  9 12 15

Desired Outcome

> print(myList2)
[[1]]
  V1 V2 V3
1  1  4  0
2  7 10 13

[[2]]
  V1 V2 V3
1  2  5  0
2  8 11 14

[[3]]
  V1 V2 V3
1  3  6  0
2  9 12 15

The different dimensions of the current data frames makes it tricky.

Here's a base method of:

  1. Adding all the column names to each list item
  2. Converting the list to an array.
  3. Transposing the array using aperm to match your intended output
  4. Optional turning the array into a list using apply .
myListBase <- myList #added because we modify the original list

#get all of the unique names from the list of dataframes
##default ordering is by ordering in list
all_cols <- Reduce(base::union, lapply(myListBase, names))

#loop, add new columns, and then re-order them so all data.frames
# have the same order
myListBase <- lapply(myListBase,
                     function(DF){
                       DF[, base::setdiff(all_cols, names(DF))] <- 0 #initialze columns
                       DF[, all_cols] #reorder columns
                       }
                     )

#create 3D array - could be simplified using abind::abind(abind(myListBase, along = 3))
myArrayBase <- array(unlist(myListBase, use.names = F),
                     dim = c(nrow(myListBase[[1]]), #rows
                             length(all_cols), #columns
                             length(myListBase) #3rd dimension
                             ),
                     dimnames = list(NULL, all_cols, NULL))

#rows and 3rd dimension are transposed
myPermBase <- aperm(myArrayBase, c(3,2,1))
myPermBase

#, , 1
#
#     V1 V2 V3
#[1,]  1  4  0
#[2,]  7 10 13
#
#, , 2
#
#     V1 V2 V3
#[1,]  2  5  0
#[2,]  8 11 14
#
#, , 3
#
#     V1 V2 V3
#[1,]  3  6  0
#[2,]  9 12 15

#make list of dataframes - likely not necessary
apply(myPermBase, 3, data.frame)

#[[1]]
#  V1 V2 V3
#1  1  4  0
#2  7 10 13
#
#[[2]]
#  V1 V2 V3
#1  2  5  0
#2  8 11 14
#
#[[3]]
#  V1 V2 V3
#1  3  6  0
#2  9 12 15

Performance

The first version of the answer had a data.table and abind method but I've removed it - the base version is much faster and there's not much additional clarity gained.

Unit: microseconds
                expr    min      lq     mean  median      uq     max neval
 camille_purrr_dplyr 7910.9 8139.25 8614.956 8246.30 8387.20 60159.5  1000
       cole_DT_abind 2555.8 2804.75 3012.671 2917.95 3061.55  6602.3  1000
           cole_base  600.3  634.40  697.987  663.00  733.10  3761.6  1000

Complete code for reference:

library(dplyr)
library(purrr)
library(data.table)
library(abind)
library(microbenchmark)

myList <- list()
df1 <- as.data.frame(matrix(1:6, nrow=3, ncol=2))
df2 <- as.data.frame(matrix(7:15, nrow=3, ncol=3))
myList[[1]]<-df1
myList[[2]]<-df2

microbenchmark(
  camille_purrr_dplyr = {
    myList %>%
      map_dfr(tibble::rownames_to_column, var = "id") %>%
      mutate_at(vars(-id), ~ifelse(is.na(.), 0, .)) %>%
      split(.$id) %>%
      map(select, -id)
  }
  ,
  cole_DT_abind = {
  myListDT <- copy(myList)
  all_cols <- Reduce(base::union, lapply(myListDT, names))

  # data.table used for side effects of updating-by-reference in lapply
  lapply(myListDT, setDT)

  # add non-existing columns
  lapply(myListDT,
         function(DT) {
           DT[, base::setdiff(all_cols, names(DT)) := 0]
           setorderv(DT, all_cols)
         })

  # abind is used to make an array
  myArray <- abind(myListDT, along = 3)

  # aperm is used to transpose the array to the preferred route
  myPermArray <- aperm(myArray, c(3,2,1))
  # myPermArray

  #or as a list of data.frames
  apply(myPermArray, 3, data.frame)
  }
  ,
  cole_base = {
    myListBase <- myList

    all_cols <- Reduce(base::union, lapply(myListBase, names))

    myListBase <- lapply(myListBase, 
                         function(DF){
                           DF[, base::setdiff(all_cols, names(DF))] <- 0
                           DF[, all_cols]
                         }
                         )

    myArrayBase <- array(unlist(myListBase, use.names = F),
                         dim = c(nrow(myListBase[[1]]), length(all_cols), length(myListBase)),
                         dimnames = list(NULL, all_cols, NULL))

    myPermBase <- aperm(myArrayBase, c(3,2,1))
    apply(myPermBase, 3, data.frame)
  }
  # ,
  # cole_base_aperm = {
  #   myListBase <- myList
  #   
  #   all_cols <- Reduce(base::union, lapply(myListBase, names))
  #   
  #   myListBase <- lapply(myListBase, 
  #                        function(DF){
  #                          DF[, base::setdiff(all_cols, names(DF))] <- 0
  #                          DF[, all_cols]
  #                        }
  #   )
  #   
  #   myArrayABind <- abind(myListBase, along = 3)
  #   
  #   myPermBase <- aperm(myArrayABind, c(3,2,1))
  #   apply(myPermBase, 3, data.frame)
  # }
, times = 1000
)

One way with a few dplyr & purrr functions is to add an ID column to each row in each data frame, bind them all, then split by that ID. The base rbind would throw an error because of the mismatched column names, but dplyr::bind_rows takes a list of any number of data frames and adds NA columns for anything missing.

First step gets you one data frame:

library(dplyr)
library(purrr)

myList %>%
  map_dfr(tibble::rownames_to_column, var = "id")
#>   id V1 V2 V3
#> 1  1  1  4 NA
#> 2  2  2  5 NA
#> 3  3  3  6 NA
#> 4  1  7 10 13
#> 5  2  8 11 14
#> 6  3  9 12 15

Fill in the NA s with 0 in all columns except the ID—this could also be adjusted if need be. Split by ID, and drop the ID column since you no longer need it.

myList %>%
  map_dfr(tibble::rownames_to_column, var = "id") %>%
  mutate_at(vars(-id), ~ifelse(is.na(.), 0, .)) %>%
  split(.$id) %>%
  map(select, -id)
#> $`1`
#>   V1 V2 V3
#> 1  1  4  0
#> 4  7 10 13
#> 
#> $`2`
#>   V1 V2 V3
#> 2  2  5  0
#> 5  8 11 14
#> 
#> $`3`
#>   V1 V2 V3
#> 3  3  6  0
#> 6  9 12 15

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM