使用 purrr 根据条件从嵌套的 dataframe 中提取值

Question

I'm working with a set of patient test results some of which are positive and negative.我正在处理一组患者测试结果，其中一些是阳性和阴性的。 I'm reducing to individual patient level using dplyr::nest() and then extracting values for the first positive test only using purrr::map() and a function I've written.我正在使用dplyr::nest()减少到单个患者级别，然后仅使用purrr::map()和我编写的 function 提取第一个阳性测试的值。 My dataset isn't huge - ~40k unique patients, ~110k test results - but I gave up running my script after 40 mins.我的数据集并不大——约 40k 独特的患者，约 110k 测试结果——但我在 40 分钟后放弃了运行我的脚本。 I'm sure there's a better way of extracting these values but am struggling to work it out.我确信有更好的方法来提取这些值，但我正在努力解决这个问题。 The code chunk below illustrates the method I'm using (though obviously this runs in no time).下面的代码块说明了我正在使用的方法（尽管很明显这会立即运行）。

library(tidyverse)

example_data <- tribble(
  
  ~patient, ~is_first_positive, ~score_1, ~score_2,
  "A", F, 10, 45,
  "A", T, 16, 76,
  "A", F, 24, 86,
  "B", T, 17, 5,
  "B", F, 24, 22,
  "B", F, 55, 97,
  "C", F, 67, 48,
  "C", F, 23, 38,
  "C", F, 45, 16
  
)

example_data <- example_data %>% 
  group_by(patient) %>% 
  nest()

# function to extract values based on value of another column
get_field <- function(df, logical_field, rtn_field) {
  
  df <- df %>% filter_(logical_field)
  
  if(nrow(df)==0) {
    return(NA_integer_)
  } else {
    df %>% pull({{rtn_field}}) %>% as.integer() %>% return()
  }

}

# Use purrr to run function against each nested df
example_data <- example_data %>% 
  mutate(first_positive_score1 = map_int(data, ~get_field(., "is_first_positive", score_1)),
         first_positive_score2 = map_int(data, ~get_field(., "is_first_positive", score_2)))

Answer 1

Here's an alternative that should be quite fast:这是一个应该非常快的替代方案：

Filter to keep only first positive score过滤以仅保留第一个正分数
Merge back in the nested data if you need it如果需要，合并回嵌套数据

library(tidyverse)

example_data <- tribble(
  ~patient, ~is_first_positive, ~score_1, ~score_2,
  "A", F, 10, 45,
  "A", T, 16, 76,
  "A", F, 24, 86,
  "B", T, 17, 5,
  "B", F, 24, 22,
  "B", F, 55, 97,
  "C", F, 67, 48,
  "C", F, 23, 38,
  "C", F, 45, 16
)

nested_data <- example_data %>% 
  group_by(patient) %>% 
  nest()

example_data %>% 
  filter(is_first_positive) %>%
  group_by(patient) %>%
  top_n(1) %>%
  full_join(nested_data)

#> Selecting by score_2
#> Joining, by = "patient"
#> # A tibble: 3 x 5
#> # Groups:   patient [3]
#>   patient is_first_positive score_1 score_2 data            
#>   <chr>   <lgl>               <dbl>   <dbl> <list>          
#> 1 A       TRUE                   16      76 <tibble [3 × 3]>
#> 2 B       TRUE                   17       5 <tibble [3 × 3]>
#> 3 C       NA                     NA      NA <tibble [3 × 3]>

Considering your comment, I re-wrote your get_field function using Base R functions and was able to get a 10x speed improvement:考虑到您的评论，我使用 Base R 函数重写了您的get_field function 并且速度提高了 10 倍：

get_field <- function(df, logical_field, rtn_field) {
  df <- df %>% filter_(logical_field)
  if(nrow(df)==0) {
    return(NA_integer_)
  } else {
    df %>% pull({{rtn_field}}) %>% as.integer() %>% return()
  }
}

get_field2 <- function(x, logical_field, rtn_field) {
  x <- x[x[[logical_field]], ]
  ifelse(nrow(x)==0, NA_integer_, x[[rtn_field]])
}

approach1 <- function() {
  example_data %>% 
  mutate(first_positive_score1 = map_int(data, ~get_field(., "is_first_positive", score_1)),
         first_positive_score2 = map_int(data, ~get_field(., "is_first_positive", score_2)))
}

approach2 <- function() {
  example_data %>% 
    mutate(first_positive_score1 = map_dbl(data, get_field2, "is_first_positive", "score_1"),
           first_positive_score2 = map_dbl(data, get_field2, "is_first_positive", "score_2"))
}

library(microbenchmark)
microbenchmark(approach1(), approach2())
#> Unit: milliseconds
#>         expr       min        lq      mean    median        uq       max neval
#>  approach1() 19.849982 24.047509 26.470304 25.001731 26.896622 95.951980   100
#>  approach2()  2.159769  2.587905  2.783555  2.648321  2.740863  7.620581   100

Answer 2

If you can forgive the long lines, you can use map() in the following way.如果您可以原谅长行，您可以按以下方式使用map() 。

library(dplyr)
library(tibble)
library(purrr)

example_data %>%
  mutate(score_1 = as.double(map(data, ~ deframe(.x[2])[which(deframe(.x[1]) == TRUE)])),
         score_2 = as.double(map(data, ~ deframe(.x[3])[which(deframe(.x[1]) == TRUE)]))) 

#   patient data             score_1 score_2
#   <chr>   <list>             <dbl>   <dbl>
# 1 A       <tibble [3 × 3]>      16      76
# 2 B       <tibble [3 × 3]>      17       5
# 3 C       <tibble [3 × 3]>      NA      NA

使用 purrr 根据条件从嵌套的 dataframe 中提取值

问题描述

2 个解决方案

解决方案1
1 2020-10-24 18:36:35

解决方案2
0 已采纳 2020-10-24 20:11:40

使用 purrr 根据条件从嵌套的 dataframe 中提取值

问题描述

2 个解决方案

解决方案1 1 2020-10-24 18:36:35

解决方案2 0 已采纳 2020-10-24 20:11:40

解决方案1
1 2020-10-24 18:36:35

解决方案2
0 已采纳 2020-10-24 20:11:40