繁体   English   中英

难以进行数据处理以填充R数据帧中的值

[英]Difficult data manipulation to fill values in an R dataframe

我手头似乎似乎很难向量化/加速问题。 我能够通过缓慢缩放的嵌套for循环解决此问题。 我正在使用的数据是NBA拥有数据,我正在对多个NBA赛季(100K-1M)行进行分析,这在嵌套循环中非常慢(数分钟)。 我创建了测试数据来突出问题:

mydf = data.frame(id1 = c(100, 100, 100, 150, 150, 150),
              id2 = c(110, 110, 110, 122, 122, 122),
              P1 = c(1, 1, 1, 1, 2, 2),
              P2 = c(2, 2, 2, 3, 3, 3),
              O1 = c(4, 4, 4, 4, 4, 4),
              O2 = c(5, 5, 6, 6, 6, 6), 
              A1 = 0, 
              A2 = 0, 
              A3 = 0, 
              A4 = 0, 
              A5 = 0,
              A6 = 0)

此数据框中有6个唯一用户(1-6),其ID出现在P1,P2,O1和O2列中。 每个用户还获得自己的列(A1-A6)。 每次用户出现在P1或P2列中的一行中时,其相应的列将得到1。每次用户出现在O1或O2列中中的一行时,其相应的列将得到-1。 我用于解决此问题的for循环如下:

for (i in 1:nrow(mydf)) {
  for (j in 3:4) {
    tmp = paste0("A",as.character(mydf[i,j]))
    mydf[i, which(colnames(mydf) == tmp)] = 1
  }

  for (j in 5:6) {
    tmp = paste0("A",as.character(mydf[i,j]))
    mydf[i, which(colnames(mydf) == tmp)] = -1
  }
}

我的实际数据帧具有P1-P5,O1-O5和大约300个唯一的播放器ID。 关于如何加快速度有任何想法吗?

谢谢!

根据您的样本数据,这应该起作用:

library(dplyr); library(tidyr); library(tibble)

mydf.calculated <- mydf %>%

  # make row names explicit so that we can join back by row later
  rownames_to_column("row.id") %>%
  select(row.id, starts_with("P"), starts_with("O")) %>%

  # convert to long format & define calculation based on whether P or O
  gather(operation, A, -row.id) %>%
  mutate(calculation = ifelse(grepl("P", operation), 1, -1)) %>%

  # if there are multiple P and/or O operations on the same user in the same row,
  # collapse into final calculated result
  group_by(row.id, A) %>%
  summarise(calculation = sum(calculation)) %>% 
  ungroup() %>%

  # spread calculated results to respective user columns
  mutate(A = paste0("A", A)) %>%
  spread(A, calculation, fill = 0) %>%

  # sort in original row order
  arrange(row.id) %>% select(-row.id)

# combine results
cbind(mydf %>% select(-starts_with("A")),
      mydf.calculated)

  id1 id2 P1 P2 O1 O2 A1 A2 A3 A4 A5 A6
1 100 110  1  2  4  5  1  1  0 -1 -1  0
2 100 110  1  2  4  5  1  1  0 -1 -1  0
3 100 110  1  2  4  6  1  1  0 -1  0 -1
4 150 122  1  3  4  6  1  0  1 -1  0 -1
5 150 122  2  3  4  6  0  1  1 -1  0 -1
6 150 122  2  3  4  6  0  1  1 -1  0 -1

效率不是特别高,但是可以:

cA <- col(mydf[,7:12])
mydf[,7:12] <- (cA==mydf$P1)+(cA==mydf$P2)-(cA==mydf$O1)-(cA==mydf$O2)

mydf
  id1 id2 P1 P2 O1 O2 A1 A2 A3 A4 A5 A6
1 100 110  1  2  4  5  1  1  0 -1 -1  0
2 100 110  1  2  4  5  1  1  0 -1 -1  0
3 100 110  1  2  4  6  1  1  0 -1  0 -1
4 150 122  1  3  4  6  1  0  1 -1  0 -1
5 150 122  2  3  4  6  0  1  1 -1  0 -1
6 150 122  2  3  4  6  0  1  1 -1  0 -1

样本数据集上的版本速度大约是您的版本的两倍; 缺点是您需要指定玩家人数

for (i in 1:6) { 
  mydf[paste0("A", i)] <- (i==mydf$P1 | i==mydf$P2) * 1 - 1*
                                   (i==mydf$O1 | i==mydf$O2)
}

这只是样本上的一点点速度,但是更容易适应不同数量的P / O列:

playercols <- function(mydf, nplayers, plus, minus) {
  for (i in 1:nplayers) {
    mydf[paste0("A", i)] <- rowSums(i==mydf[, plus]) - 
      rowSums(i==mydf[, minus])
  }
  mydf
}

playercols(mydf, 6, 3:4, 5:6)

测量了其他答案的运行时间后,这可能是最快的。 这是@Glen_b答案的修改版本,可灵活用于非顺序ID:

vals <- gsub("^A","",names(mydf)[grep("^A",names(mydf))]),
cA <- data.frame(sapply(vals,function(i) rep(i,length(vals)))),
mydf[,grep("A",names(mydf))] <- (cA==mydf$P1)+(cA==mydf$P2)-(cA==mydf$O1)-(cA==mydf$O2)

输出:

  id1 id2 P1 P2 O1 O2 A1 A7 A3 A8 A5 A10
1 100 110  1  7 10  5  1  1  0  0 -1  -1
2 100 110  1  7 10  5  1  1  0  0 -1  -1
3 100 110  5  7  1  8 -1  1  0 -1  1   0
4 150 122  1 10  7  8  1 -1  0 -1  0   1
5 150 122  3  3  5  7  0 -1  2  0 -1   0
6 150 122  3  8  3  5  0  0  0  1 -1   0

这是我编辑的示例数据,其中包括非顺序ID:

mydf = data.frame(id1 = c(100, 100, 100, 150, 150, 150),
              id2 = c(110, 110, 110, 122, 122, 122),
              P1 = c(1, 1, 5, 1, 3, 3),
              P2 = c(7, 7, 7, 10, 3, 8),
              O1 = c(10, 10, 1, 7, 5, 3),
              O2 = c(5, 5, 8, 8, 7, 5), 
              A1 = 0, 
              A7 = 0, 
              A3 = 0, 
              A8 = 0, 
              A5 = 0,
              A10 = 0)

要测量运行时间,可以使用像microbenchmark这样的包:

require(microbenchmark)

microbenchmark(
  vals <- gsub("^A","",names(mydf)[grep("^A",names(mydf))]),
  cA <- data.frame(sapply(vals,function(i) rep(i,length(vals)))),
  mydf[,grep("A",names(mydf))] <- (cA==mydf$P1)+(cA==mydf$P2)-(cA==mydf$O1)-(cA==mydf$O2)
)

Unit: microseconds
     min        lq       mean    median       uq      max neval cld
  19.263   27.4365   44.48546   37.4500   48.158  150.556   100   a  
 460.698  555.1930  869.30677  692.5255 1004.787 3343.197   100   b 
1378.804 1656.6080 2815.49635 2140.1545 3216.846 8664.538   100   c

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM