R - Speed up subsetting a data table based on another one?
I have a data table dat1 containing daily rainfall measurements for several stations:
> dat1
date ID value
1: 2000-03-01 1559 0
2: 2000-03-02 1559 0
3: 2000-03-03 1559 0
4: 2000-03-04 1559 0
5: 2000-03-05 1559 0
---
106178: 2019-12-27 1322 2
106179: 2019-12-28 1322 1
106180: 2019-12-29 1322 2
106181: 2019-12-30 1322 2
106182: 2019-12-31 1322 0
I also have another data table, dat2, which for each station in dat1 lists some neighbouring stations together with the distance between them and the range of measurement dates they have in common:
> dat2
ID1 ID2 dist common_date_begin common_date_end diff_days
1: 1549 1550 490774.05 2010-02-23 2017-06-16 2670
2: 1549 1551 290832.68 2010-02-23 2017-06-16 2670
3: 1549 1552 87750.38 2006-02-01 2017-06-16 4153
4: 1549 1553 138531.18 2006-02-01 2017-06-16 4153
5: 1549 1554 103870.34 2000-03-01 2017-06-16 6316
6: 1549 1555 112919.70 2000-03-01 2017-06-16 6316
7: 1549 1556 19625.65 2000-03-01 2017-06-16 6316
8: 1549 1557 398693.43 2000-03-01 2017-06-16 6316
9: 1549 1558 73514.23 2000-03-01 2017-06-16 6316
10: 1549 1559 129691.63 2000-03-01 2017-06-16 6316
For each ID1-ID2 pair in dat2, I want to subset dat1 to those two stations and compute the correlation between them. The following code does what I need:
library(data.table)
dat1 <- fread("https://www.dropbox.com/s/d2s61du255vzu7g/dat1.csv?dl=1") # ~2 MB
dat2 <- fread("https://www.dropbox.com/s/7n0z0gbeoifss4j/dat2.csv?dl=1") # ~5 KB
# fix column classes
dat1$date <- as.Date(dat1$date)
dat1$ID <- as.character(dat1$ID)
dat2[, (c("common_date_begin","common_date_end")) := lapply(.SD, as.Date), .SDcols = c("common_date_begin","common_date_end")]
dat2[, (c("ID1","ID2")) := lapply(.SD, as.character), .SDcols = c("ID1","ID2")]
# get list of unique stations
ids <- unique(dat2$ID1)
# initialize matrix to hold correlations
correlations <- matrix(NA, nrow = nrow(dat2), ncol=1)
# initialize data frame to hold results
results <- as.data.frame(dat2[, -c(4:5)])
# initialize loop counters
x <- 1
# loop over the main ID's
for (i in ids) {
  tmp <- dat2[ID1 == i]
  # loop over the IDs of the neighbour stations
  for (id in 1:nrow(tmp)) {
    # get ID of the neighbour
    near_id <- as.numeric(tmp[id, 2])
    # get common dates
    beg_date <- tmp[id, 4]
    end_date <- tmp[id, 5]
    # calculate correlation
    correlations[x, 1] <- cor(dat1[ID == i & date %between% c(beg_date, end_date)]$value,
                              dat1[ID == near_id & date %between% c(beg_date, end_date)]$value)
    # increment loop counter
    x <- x + 1
  }
}
# assemble final data frame
results <- data.table(ID = results[, 1],
                      ID_nearest = results[, 2],
                      distance = results[, 3],
                      overlapping_days = results[, 4],
                      correl = as.vector(correlations))
For this toy example it works very fast. However, it takes forever on my real 4-million-row data table. I waited about 3 hours and still didn't see the end of the processing. So my question is: is there any way to speed up the above code, short of translating it to C++? Since I have 32 cores available, would the code lend itself to any multicore approach, say mclapply or foreach? Any hints on getting started with either?
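To make the question concrete, this is a rough sketch of the kind of mclapply approach I have in mind (untested; pair_cor is just a placeholder name, and the column names are those of dat1 and dat2 above):

```r
library(data.table)
library(parallel)

# Hypothetical helper: correlation for the k-th id-pair in dat2.
# Column names (ID, date, value, ID1, ID2, common_date_begin,
# common_date_end) follow the tables shown above.
pair_cor <- function(k, dat1, dat2) {
  row <- dat2[k]
  v1 <- dat1[ID == row$ID1 & date %between% c(row$common_date_begin, row$common_date_end)]$value
  v2 <- dat1[ID == row$ID2 & date %between% c(row$common_date_begin, row$common_date_end)]$value
  cor(v1, v2)
}

# On the 32-core machine one could then try:
# correls <- unlist(mclapply(seq_len(nrow(dat2)), pair_cor,
#                            dat1 = dat1, dat2 = dat2, mc.cores = 32))
```

Each row of dat2 is independent of the others, so in principle the loop body should parallelize without shared state.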
You can speed up your code by using joins to match the id-pairs in dat2 with the measurements in dat1 instead of the for loop. As a side note: at least to my eye, using joins also makes the code more concise and cleaner, and more robust.
A further note: I found a bug in your code. Filling the correlations matrix via correlations[x, 1] leads to correlations being assigned to the wrong id-pairs.
For comparison with your approach I set up two functions: cor_join implements a simple tidyverse take on this idea, while cor_loop is a wrapper around your code.
Microbenchmarking the two functions shows that using joins speeds up the computation by a factor of about 2.5. I'm not familiar with data.table, but I'd guess that using data.table or dtplyr (a data.table backend for dplyr) could improve performance further, especially on your real dataset.
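Since I'm only guessing at data.table, here is a hedged sketch of what a pure data.table analogue of the join idea could look like; cor_dt is my own name and I have not run this against the real dataset, only against small toy data:

```r
library(data.table)

# Sketch: same join logic as cor_join, expressed with data.table merges.
cor_dt <- function(dat1, dat2) {
  # attach ID1's measurements (one row per pair per date)
  d <- merge(dat2, dat1, by.x = "ID1", by.y = "ID", allow.cartesian = TRUE)
  setnames(d, "value", "value1")
  # attach ID2's measurement on the same date (inner join drops non-common dates)
  d <- merge(d, dat1, by.x = c("ID2", "date"), by.y = c("ID", "date"))
  setnames(d, "value", "value2")
  # correlate within the common date range of each pair
  d[date >= common_date_begin & date <= common_date_end,
    .(correl = cor(value1, value2)),
    by = .(ID1, ID2, dist, diff_days)]
}
```

The allow.cartesian flag is needed because each id-pair in dat2 matches many measurement dates in dat1.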
library(data.table)
library(dplyr)
library(purrr)
library(ggplot2)
library(microbenchmark)
dat1 <- fread("https://www.dropbox.com/s/d2s61du255vzu7g/dat1.csv?dl=1") # ~2 MB
dat2 <- fread("https://www.dropbox.com/s/7n0z0gbeoifss4j/dat2.csv?dl=1") # ~5 KB
# fix column classes
dat1$date <- as.Date(dat1$date)
dat1$ID <- as.character(dat1$ID)
dat2[, (c("common_date_begin","common_date_end")) := lapply(.SD, as.Date), .SDcols = c("common_date_begin","common_date_end")]
dat2[, (c("ID1","ID2")) := lapply(.SD, as.character), .SDcols = c("ID1","ID2")]
cor_join <- function(dat1, dat2) {
  # We want to get a dataframe with
  #
  # 1. pairs of sites,
  # 2. dates where we have measurements for both
  # 3. the measurements at each site
  #
  # This can be achieved via left_joins
  dat3 <- dat2 %>%
    # Join dates and measurements for ID1
    left_join(dat1, by = c("ID1" = "ID")) %>%
    rename(value1 = value) %>%
    # Join dates and measurements for ID2 on the same date
    left_join(dat1, by = c("ID2" = "ID", "date" = "date")) %>%
    rename(value2 = value, ID = ID1, ID_nearest = ID2)
  # Compute correlations
  dat3 %>%
    # Keep only observations inside the common date range
    filter(date >= common_date_begin & date <= common_date_end) %>%
    group_by(ID, ID_nearest, dist, common_date_begin, common_date_end, diff_days) %>%
    summarise(correl = cor(value1, value2)) %>%
    ungroup()
}
cor_loop <- function(dat1, dat2) {
  # get list of unique stations
  ids <- unique(dat2$ID1)
  # initialize matrix to hold correlations
  correlations <- matrix(NA, nrow = nrow(dat2), ncol = 1)
  # initialize data frame to hold results
  results <- as.data.frame(dat2[, -c(4:5)])
  # initialize loop counter
  x <- 1
  # loop over the main IDs
  for (i in ids) {
    tmp <- dat2[ID1 == i]
    # loop over the IDs of the neighbour stations
    for (id in 1:nrow(tmp)) {
      # get ID of the neighbour
      near_id <- as.numeric(tmp[id, 2])
      # get common dates
      beg_date <- tmp[id, 4]
      end_date <- tmp[id, 5]
      # calculate correlation
      correlations[x, 1] <- cor(dat1[ID == i & date %between% c(beg_date, end_date)]$value,
                                dat1[ID == near_id & date %between% c(beg_date, end_date)]$value)
      # increment loop counter
      x <- x + 1
    }
  }
  # assemble final data frame
  results <- data.table(ID = results[, 1],
                        ID_nearest = results[, 2],
                        distance = results[, 3],
                        overlapping_days = results[, 4],
                        correl = as.vector(correlations))
  results
}
# microbenchmark
microbenchmark::microbenchmark(cor_join(dat1, dat2), cor_loop(dat1, dat2), times = 10)
#> Unit: milliseconds
#>                  expr      min       lq     mean   median       uq      max neval
#>  cor_join(dat1, dat2) 247.4106 286.1556 301.6367 296.6921 302.2751 400.8654    10
#>  cor_loop(dat1, dat2) 773.5274 784.9197 807.3767 798.4800 842.3080 854.1716    10
To check whether the two functions give the same results, I made a scatter plot:
# Check result
results <- list(join = cor_join(dat1, dat2), loop = cor_loop(dat1, dat2))
# Plot
check <- results %>%
  purrr::reduce(left_join, by = c("ID", "ID_nearest"), suffix = c("_join", "_loop"))
check %>%
  ggplot(aes(correl_join, correl_loop, color = ID)) +
  geom_point()
Oops: the scatter plot shows differing results? To investigate, I filtered the dataset down to a simple one containing only stations 1183, 1550 and 1551:
dat1a <- dat1 %>% filter(ID %in% c(1183, 1550, 1551)) %>% as.data.table()
dat2a <- dat2 %>% filter(ID1 %in% c(1183, 1550, 1551), ID2 %in% c(1183, 1550, 1551)) %>% as.data.table()
# For the simple dataset I get the same correlations
cor_join(dat1a, dat2a)
#> # A tibble: 2 x 7
#> ID ID_nearest dist common_date_begin common_date_end diff_days correl
#> <chr> <chr> <dbl> <date> <date> <int> <dbl>
#> 1 1183 1550 1576360. 2010-02-23 2017-06-16 2670 0.0456
#> 2 1183 1551 1513356. 2010-02-23 2017-06-16 2670 -0.0251
cor_loop(dat1a, dat2a)
#> ID ID_nearest distance overlapping_days correl
#> 1: 1183 1550 1576360 2670 0.04564506
#> 2: 1183 1551 1513356 2670 -0.02513991
After inspecting your code, my guess is that the differences are caused by correlations[x, 1] assigning correlations to the wrong id-pairs. To check this I adapted cor_loop: besides returning the results df, it also returns a second df, correlations2, which is filled inside the loop and holds not only the correlations but also the corresponding values of id and near_id:
cor_loop_check <- function(dat1, dat2) {
  # get list of unique stations
  ids <- unique(dat2$ID1)
  # initialize matrix to hold correlations
  correlations <- matrix(NA, nrow = nrow(dat2), ncol = 1)
  correlations2 <- data.frame(id1 = rep(NA, nrow(dat2)),
                              id2 = rep(NA, nrow(dat2)),
                              correl = rep(NA, nrow(dat2)))
  # initialize data frame to hold results
  results <- as.data.frame(dat2[, -c(4:5)])
  # initialize loop counter
  x <- 1
  # loop over the main IDs
  for (i in ids) {
    tmp <- dat2[ID1 == i]
    # loop over the IDs of the neighbour stations
    for (id in 1:nrow(tmp)) {
      # get ID of the neighbour
      near_id <- as.numeric(tmp[id, 2])
      # get common dates
      beg_date <- tmp[id, 4]
      end_date <- tmp[id, 5]
      # calculate correlation
      correlations[x, 1] <- cor(dat1[ID == i & date %between% c(beg_date, end_date)]$value,
                                dat1[ID == near_id & date %between% c(beg_date, end_date)]$value)
      # put correlation in df together with current id and near id
      correlations2[x, "id1"] <- i
      correlations2[x, "id2"] <- near_id
      correlations2[x, "correl"] <- correlations[x, 1]
      # increment loop counter
      x <- x + 1
    }
  }
  # assemble final data frame
  results <- data.table(ID = results[, 1],
                        ID_nearest = results[, 2],
                        distance = results[, 3],
                        overlapping_days = results[, 4],
                        correl = as.vector(correlations))
  list(results, correlations2)
}
results_check <- cor_loop_check(dat1, dat2)
# Check results for e.g. row 20: same correlation value but a differing id-pair :(
results_check[[1]][20,]
#> ID ID_nearest distance overlapping_days correl
#> 1: 1315 1551 1193032 2670 -0.06323207
results_check[[2]][20,]
#> id1 id2 correl
#> 20 1315 1559 -0.06323207
Created on 2020-03-14 by the reprex package (v0.3.0)
As you can see, in row 20 the two data frames contain the same correlation but different id-pairs.
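A possible fix for the loop version (my own sketch, with my own name cor_loop_fixed) is to iterate over the rows of dat2 directly and index the correlations by that row number, so the results can no longer be shifted by the iteration order over unique(dat2$ID1):

```r
library(data.table)

# Sketch: index correlations by the pair's own row in dat2 instead of
# a manually incremented counter, so results always line up with dat2.
cor_loop_fixed <- function(dat1, dat2) {
  correlations <- numeric(nrow(dat2))
  for (k in seq_len(nrow(dat2))) {
    beg_date <- dat2$common_date_begin[k]
    end_date <- dat2$common_date_end[k]
    correlations[k] <- cor(
      dat1[ID == dat2$ID1[k] & date %between% c(beg_date, end_date)]$value,
      dat1[ID == dat2$ID2[k] & date %between% c(beg_date, end_date)]$value)
  }
  data.table(ID = dat2$ID1, ID_nearest = dat2$ID2,
             distance = dat2$dist, overlapping_days = dat2$diff_days,
             correl = correlations)
}
```

This keeps the per-pair work identical to your code; only the bookkeeping changes.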