简体   繁体   English

R 将 List 转换为 Data.Frame 或 Table

[英]R Convert List Into Data.Frame or Table

# Sample input: one numeric vector per student, 12 values each, laid out as
# 4 yearly Grade values, then 4 Class values, then 4 Score values.
list1 <- list(
  c(
    4, 5, 6, 7, # Grade
    1, 1, 1, 1, # Class
    3, 1, 3, 3  # Score
  ),
  c(
    3, 4, 5, 6,
    2, 2, 2, 2,
    1, 4, 2, 1
  ),
  c(
    1, 2, 3, 4,
    1, 1, 1, 1,
    3, 2, 1, 1
  ),
  c(
    5, 6, 7, 8,
    1, 1, 1, 1,
    4, 4, 4, 3
  ),
  c(
    2, 3, 4, 5,
    2, 2, 2, 2,
    2, 1, 2, 1
  )
)

# Desired long-format result: one row per student (ID) and year (Time).
# All columns are kept as doubles, exactly as if every value were typed out.
data1 <- data.frame(
  ID    = rep(c(1, 2, 3, 4, 5), each = 4),
  Time  = rep(c(1, 2, 3, 4), times = 5),
  Grade = c(4, 5, 6, 7, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5),
  Class = rep(c(1, 2, 1, 1, 2), each = 4),
  Score = c(3, 1, 3, 3, 1, 4, 2, 1, 3, 2, 1, 1, 4, 4, 4, 3, 2, 1, 2, 1)
)

I have 'list1'. Each item in 'list1' holds a single individual's Grade, Class, and Score for 4 years.我有“list1”。“list1”中的每一项都包含一个人 4 年的 Grade、Class 和 Score。 So 'list1' has 5 students and 12 records for each student (4 records for each of the three variables: Grade, Class, and Score).所以“list1”有 5 个学生和每个学生的 12 条记录(三个变量中的每一个都有 4 条记录:Grade、Class 和 Score)。 I wish to turn 'list1' into 'data1', a long data file where 'ID' equals the list item number in 'list1'.我希望将“list1”转换为“data1”,这是一个长数据文件,其中“ID”等于“list1”中的列表项编号。 Time equals the time of the record (every student has 4 time measures), Grade equals the first 4 data points of EACH element in list1, Class the next 4, and Score the last 4. Time 等于记录的时间(每个学生有 4 个时间度量),Grade 等于 list1 中每个元素的前 4 个数据点,Class 是接下来的 4 个,Score 是最后 4 个。

Sample output is shown turning 'list1' into desired output 'data1'.示例 output 显示将“list1”转换为所需的 output“data1”。

This data set is HUGE, so I am hoping for an efficient approach to doing this conversion.这个数据集是巨大的,所以我希望有一种有效的方法来进行这种转换。

I'm not sure it'll be efficient but it's concise:我不确定它是否有效,但它很简洁:

# NOTE: setDT() converts list1 to a data.table by reference, so from here on
#   list1 is a table with one column per student, not a plain list.
setDT(list1)
# Name each student's column after its position. seq_along() replaces the
#   hard-coded 1:5 so the same code works for any number of students.
# could also do something like paste0('student', seq_along(list1)) for
#   clarity, and adjust patterns() below accordingly
setnames(list1, paste0(seq_along(list1)))
# 4 = # of values of Time
list1[ , colid := rep(c('Grade', 'Class', 'Score'), each = 4L)]
# 3 = # of columns "stacked" in each student's column initially
list1[ , Time := rep(1:4, 3L)]
# first, reshape long: the digit-named student columns become rows, with the
#   column name stored in ID (variable.factor = FALSE keeps ID as character)
list1[ , melt(.SD, measure.vars = patterns('^[0-9]+'), variable.name = 'ID',
              variable.factor = FALSE)
       # now, reshape to the final format: one column per value of colid
       ][ , dcast(.SD, ID + Time ~ colid, value.var = 'value')]
#         ID  Time Class Grade Score
#     <char> <int> <num> <num> <num>
#  1:      1     1     1     4     3
#  2:      1     2     1     5     1
#  3:      1     3     1     6     3
#  4:      1     4     1     7     3
#  5:      2     1     2     3     1
#  6:      2     2     2     4     4
#  7:      2     3     2     5     2
#  8:      2     4     2     6     1
#  9:      3     1     1     1     3
# 10:      3     2     1     2     2
# 11:      3     3     1     3     1
# 12:      3     4     1     4     1
# 13:      4     1     1     5     4
# 14:      4     2     1     6     4
# 15:      4     3     1     7     4
# 16:      4     4     1     8     3
# 17:      5     1     2     2     2
# 18:      5     2     2     3     1
# 19:      5     3     2     4     2
# 20:      5     4     2     5     1
#         ID  Time Class Grade Score

The inefficiency would come from having two operations here.效率低下将来自这里有两个操作。

The approach of building the table skeleton first, then populating it may be faster, like this:先构建表格骨架,然后填充它的方法可能更快,如下所示:

# Build the ID x Time skeleton first, then fill in the value columns by group.
# 4 = # of Times per ID&Column (assuming your table is rectangular)
# seq_along() rather than 1:length(): an empty list then produces no IDs
#   instead of the bogus pair 1, 0
out <- CJ(ID = seq_along(list1), Time = 1:4)
# relies on ID being an integer, so that ID = 1 --> list1[[1]]
#   gives ID=1's data
out[ , by = ID, c('Grade', 'Class', 'Score') := {
  # matrix() fills column by column, so values 1-4, 5-8 and 9-12 of the
  #   student's vector become the three new columns, in that order
  as.data.table(matrix(list1[[ .BY$ID ]], ncol = 3L))
}]

It may be that as.data.table is also inefficient but this code is more readable than the alternative:可能as.data.table也效率低下,但此代码比替代代码更具可读性:

# Same skeleton-then-fill approach, slicing the vector directly instead of
#   going through matrix() and as.data.table().
# seq_along() rather than 1:length(): an empty list then produces no IDs
#   instead of the bogus pair 1, 0
out <- CJ(ID = seq_along(list1), Time = 1:4)
out[ , by = ID, c('Grade', 'Class', 'Score') := {
  student_data <- list1[[.BY$ID]]
  # j = 1, 2, 3 selects positions 1-4, 5-8 and 9-12 respectively
  lapply(1:3, function(j) student_data[4L * (j - 1) + 1:4])
}]

Here's another base solution that is very fast.这是另一个非常快的基本解决方案。 It is less elegant but the idea is that we minimize memory use by filling out a matrix with a loop.它不太优雅,但我们的想法是我们通过用循环填充矩阵来最小化 memory 的使用。

# Allocate the whole result up front (4 rows per student) and fill it in place.
mat <- matrix(
  0,
  nrow = length(list1) * 4L,
  ncol = 5L,
  dimnames = list(NULL, c("ID", "Time", "Grade", "Class", "Score"))
)

filled <- 0L      # rows written so far
year_index <- 1:4

for (student in seq_along(list1)) {
  values <- list1[[student]]
  n_rows <- length(values) / 3
  target <- filled + seq_len(n_rows)

  mat[target, "ID"] <- student
  mat[target, "Time"] <- year_index
  # matrix() fills by column: values 1-4 -> Grade, 5-8 -> Class, 9-12 -> Score
  mat[target, c("Grade", "Class", "Score")] <- matrix(values, ncol = 3L)

  filled <- filled + n_rows
}

And here is a faster way which unlists and then makes a matrix by selecting our unlisted elements in a certain order:这是一种更快的方法,它可以通过按特定顺序选择未列出的元素来取消列出然后生成矩阵:

n <- length(list1)
# All students' values in one vector: student k occupies positions
#   12 * (k - 1) + 1 through 12 * k.
flat_values <- unlist(list1, use.names = FALSE)
# Positions of the first variable (offsets 1-4) for every student in turn.
first_var_pos <- rep(1:4, n) + 12 * rep(0:(n - 1L), each = 4)
# Repeat that pattern shifted by 0, 4 and 8 to pick up all three variables;
#   the resulting order is exactly column-major for an (n * 4) x 3 matrix.
pick <- rep(first_var_pos, 3) + rep(c(0, 4, 8), each = n * 4L)
matrix(flat_values[pick], ncol = 3)

Then finally, if you still need speed, Rcpp can be used:最后,如果你仍然需要速度,可以使用Rcpp

# Compile a C++ helper, rcpp_combo(x), that takes a list of numeric vectors
# and returns a numeric matrix with 4 rows per list element and 5 columns:
#   column 1: 1-based position of the element in the list
#   column 2: 1..4
#   column 3: values 1-4 of the element
#   column 4: values 5-8 of the element
#   column 5: values 9-12 of the element
# The C++ code sets no column names and does no length check of its own, so
# every element is expected to hold at least 12 values.
Rcpp::cppFunction(
  " NumericMatrix rcpp_combo(List x) {
  NumericMatrix out(x.size() * 4, 5);
  int init = 0;

  for (int i = 0; i < x.size(); i++) {
    NumericVector tmp = x(i);
    int ID = i + 1;
    for (int j = 0; j < 4; j++) {
      int ind = j + init;

      out(ind, 0) = ID;
      out(ind, 1) = j + 1;
      out(ind, 2) = tmp(j);
      out(ind, 3) = tmp(4 + j);
      out(ind, 4) = tmp(8 + j);
    }
    init += 4;
  }
  return(out);
}"
)
# Run the compiled function on the sample data.
rcpp_combo(list1)    

Using @Sathish's benchmarks, these methods take between 0.05 and 2 seconds.使用@Sathish 的基准,这些方法耗时在 0.05 到 2 秒之间。

# Benchmark input: mget() fetches list1 100,000 times and unlist() flattens
# one level, giving 500,000 student vectors.
big_list <- unlist(mget(x = rep('list1', 100000)), recursive = FALSE)

# Timing 1: the compiled Rcpp function.
system.time(rcpp_combo(big_list))
##   user  system elapsed 
##   0.07    0.00    0.06 

# Timing 2: unlist once, then reorder with a precomputed index vector.
# The result holds only the three value columns (no ID or Time).
system.time({
  n = length(big_list)
  mat2 = matrix(unlist(big_list, use.names = FALSE)[rep(rep(1:4, n) + 12 * rep(0:(n-1L), each = 4), 3) + rep(c(0, 4, 8), each = n * 4L)], ncol = 3)
})
##   user  system elapsed 
##   0.20    0.02    0.22 

# big_list is rebuilt here with the same expression as above.
big_list <- unlist(mget(x = rep('list1', 100000)), recursive = FALSE)
# Timing 3: preallocate the result matrix and fill it student by student.
system.time({
mat = matrix(0, nrow = length(big_list) * 4L, ncol = 5L, dimnames = list(NULL, c("ID", "Time", "Grade", "Class", "Score")))
# rw counts the rows already written.
rw = 0L
times = 1:4
for (i in seq_along(big_list)) {
  l = big_list[[i]]
  # three variables per student, so a third of the vector's length is the
  # number of rows this student contributes
  new_rw = length(l) / 3
  inds = seq_len(new_rw) + rw
  mat[inds, 1L] = i
  mat[inds, 2L] = times
  # matrix() fills by column, which maps the vector onto columns 3 to 5
  mat[inds, 3:5] = matrix(l, ncol = 3L)
  rw = new_rw + rw
}
})
##   user  system elapsed 
##   2.08    0.03    2.21

One purrr and dplyr solution could be:一种 purrr 和 dplyr 的解决方案可能是:

# Turn each student's vector into a 4 x 3 data frame (matrix() fills by
# column, so the columns are Grade, Class, Score), then row-bind them.
# .id = "ID" records each element's position in list1 as a character column.
map_dfr(.x = list1,
        ~ as.data.frame(matrix(.x, 4, 3)) %>%
          setNames(c("Grade", "Class", "Score")), .id = "ID") %>%
  group_by(ID) %>%
  # row_number() numbers the rows within each student
  mutate(Time = row_number()) %>%
  # drop the grouping so later verbs do not silently operate per student
  ungroup()

   ID    Grade Class Score  Time
   <chr> <dbl> <dbl> <dbl> <int>
 1 1         4     1     3     1
 2 1         5     1     1     2
 3 1         6     1     3     3
 4 1         7     1     3     4
 5 2         3     2     1     1
 6 2         4     2     4     2
 7 2         5     2     2     3
 8 2         6     2     1     4
 9 3         1     1     3     1
10 3         2     1     2     2
11 3         3     1     1     3
12 3         4     1     1     4
13 4         5     1     4     1
14 4         6     1     4     2
15 4         7     1     4     3
16 4         8     1     3     4
17 5         2     2     2     1
18 5         3     2     1     2
19 5         4     2     2     3
20 5         5     2     1     4

Using base R, we can iterate over the index of list1 and create a dataframe for each list.使用基础 R,我们可以遍历list1的索引并为每个列表创建 dataframe。

# One small data frame per student, stacked with a single rbind() call.
student_frames <- lapply(seq_along(list1), function(i) {
  values <- list1[[i]]
  data.frame(
    ID = i,
    Time = 1:4,
    Grade = values[1:4],
    Class = values[5:8],
    Score = values[9:12]
  )
})
do.call(rbind, student_frames)

#   ID Time Grade Class Score
#1   1    1     4     1     3
#2   1    2     5     1     1
#3   1    3     6     1     3
#4   1    4     7     1     3
#5   2    1     3     2     1
#6   2    2     4     2     4
#7   2    3     5     2     2
#8   2    4     6     2     1
#9   3    1     1     1     3
#10  3    2     2     1     2
#11  3    3     3     1     1
#12  3    4     4     1     1
#13  4    1     5     1     4
#14  4    2     6     1     4
#15  4    3     7     1     4
#16  4    4     8     1     3
#17  5    1     2     2     2
#18  5    2     3     2     1
#19  5    3     4     2     2
#20  5    4     5     2     1

Using 10 Million data points使用 1000 万个数据点

Data:数据:

# Sample data: per student, 4 Grade values, then 4 Class, then 4 Score.
list1 <- list(
  c(4, 5, 6, 7,  1, 1, 1, 1,  3, 1, 3, 3),
  c(3, 4, 5, 6,  2, 2, 2, 2,  1, 4, 2, 1),
  c(1, 2, 3, 4,  1, 1, 1, 1,  3, 2, 1, 1),
  c(5, 6, 7, 8,  1, 1, 1, 1,  4, 4, 4, 3),
  c(2, 3, 4, 5,  2, 2, 2, 2,  2, 1, 2, 1)
)

# Fetch list1 100,000 times, then flatten one level: 500,000 student vectors.
list1_copies <- mget(x = rep("list1", 100000))
big_list <- unlist(list1_copies, recursive = FALSE)

Code: - Using Base-R: split()代码:- 使用 Base-R: split()

system.time({
  # Label each of the 12 positions in a student's vector with its variable.
  col_levels <- rep(c('Grade', 'Class', 'Score'), each = 4)

  # Collect the per-student matrices in a separate, preallocated list.
  # Writing them back into big_list would replace the input vectors with
  # 4 x 5 matrices and break any later code that reads big_list.
  student_mats <- vector('list', length(big_list))

  for (x in seq_along(big_list)) {
    # split() returns its groups in sorted label order, so the value columns
    # come out as Class, Grade, Score.
    student_mats[[x]] <- do.call('cbind', list(ID = x, Time = 1:4,
                                        do.call('cbind', split(big_list[[x]], col_levels))))
  }

  final_df <- do.call('rbind', student_mats)
})

# user  system elapsed 
# 82.86    0.31   83.78

Comparison: Using data.table比较:使用data.table

@MichaelChirico @MichaelChirico

library('data.table')
# Times the skeleton-then-fill data.table approach on big_list.
system.time({
  # Skeleton: every ID x Time combination.
  # 4 = # of Times per ID&Column (assuming your table is rectangular)
  out = CJ(ID = 1:length(big_list), Time = 1:4)
  # relies on ID being an integer, so that ID = 1 --> big_list[[1]]
  #   gives ID=1's data
  out[ , by = ID, c('Grade', 'Class', 'Score') := {
    # matrix() fills by column: values 1-4, 5-8, 9-12 become the 3 columns
    as.data.table(matrix(big_list[[ .BY$ID ]], ncol = 3L))
  }]
})

# user  system elapsed 
# 76.22    0.25   76.80

Output:输出:

# Size check: 500,000 students x 4 rows each, 5 columns.
dim(final_df)
# [1] 2000000      5

# First rows. final_df is a matrix (note the [1,] row labels), and its value
# columns appear in the order Class, Grade, Score.
head(final_df)
#      ID Time Class Grade Score
# [1,]  1    1     1     4     3
# [2,]  1    2     1     5     1
# [3,]  1    3     1     6     3
# [4,]  1    4     1     7     3
# [5,]  2    1     2     3     1
# [6,]  2    2     2     4     4

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM