如果值為 NA，則估算變量 dplyr

Question

如果dt_drug_end缺失（NA），我正在嘗試估算一個變量dt_drug_end 。

邏輯是：
1. 如果該記錄屬於某個 line_number 組，則按 line_number 分組，並將缺失的 dt_drug_end 估算為組內的 max(dt_drug_end)。
2. 否則，如果開始日期可用，則將結束日期估算為dt_drug_start 。

當 max(dt_drug_end) 為 NA 時，我的代碼不會將結束日期估算為開始日期。即使我使用 case_when(is.na(dt_drug_end) & is.na(max_dt_drug_end) ~ dt_drug_start) 這個邏輯，得到的 dt_drug_end 仍然是 NA，而不是與 dt_drug_start 相同。

patient_id line_number_raw drug_name           dt_drug_start dt_drug_end reason_discount         trt_diag flag_after_diag_adv
  <chr>      <chr>           <chr>               <date>        <date>      <chr>                   <chr>                  <dbl>
1 0Q97V6R3GI 1               Carboplatin         2013-04-12    2013-04-18  Yes, Comment field      C34.3                      1
2 0Q97V6R3GI 1               Cisplatin           2013-04-12    2013-06-16  Completed Treatment     C34.3                      1
3 0Q97V6R3GI 3               Denosumab           2016-12-06    NA          No                      C34.3                      1
4 0Q97V6R3GI 4               Osimertinib         2018-06-14    NA          No                      C34.3                      1
5 0Q97V6R3GI 1               Pemetrexed Disodium 2013-04-12    2013-06-16  Completed Treatment     C34.3                      1
6 0Q97V6R3GI 3               Erlotinib           2015-06-02    2018-05-30  Yes, due to progression C34.9                      1

數據

# Sample data: six drug episodes for one patient. Rows 3 and 4 have a missing
# end date (NA) and are the rows that need imputing.
df <- structure(
  list(
    patient_id = rep("0Q97V6R3GI", 6L),
    line_number_raw = c("1", "1", "3", "4", "1", "3"),
    drug_name = c(
      "Carboplatin", "Cisplatin", "Denosumab",
      "Osimertinib", "Pemetrexed Disodium", "Erlotinib"
    ),
    # Dates written as ISO strings; they are stored as days since 1970-01-01.
    dt_drug_start = as.Date(c(
      "2013-04-12", "2013-04-12", "2016-12-06",
      "2018-06-14", "2013-04-12", "2015-06-02"
    )),
    dt_drug_end = as.Date(c(
      "2013-04-18", "2013-06-16", NA,
      NA, "2013-06-16", "2018-05-30"
    )),
    reason_discount = c(
      "Yes, Comment field", "Completed Treatment", "No",
      "No", "Completed Treatment", "Yes, due to progression"
    ),
    trt_diag = c(rep("C34.3", 5L), "C34.9"),
    flag_after_diag_adv = rep(1, 6L)
  ),
  # Compact row-name form for six automatically numbered rows.
  row.names = c(NA_integer_, -6L),
  class = c("tbl_df", "tbl", "data.frame")
)

我的代碼：

# Asker's original attempt at imputing missing dt_drug_end values.
# NOTE(review): the sample data above is named `df`; `tb_episode` is not
# defined in this snippet -- presumably it is the same table.
tb_episode %>% 
# Step 1: compute the latest known end date per patient and treatment line.
group_by(patient_id, line_number_raw) %>% 
mutate(max_dt_drug_end = case_when(
    # Rows whose line number is "unknown" get no group maximum.
    line_number_raw == "unknown" ~ NA_Date_,
    # When every dt_drug_end in the group is NA, max(..., na.rm = T) returns
    # -Inf (with a warning) rather than NA, so the is.na() checks below
    # treat max_dt_drug_end as present.
    TRUE ~ max(dt_drug_end, na.rm = T)
)) %>% 
ungroup() %>% 
# Step 2: fill dt_drug_end; the first matching condition wins.
mutate(dt_drug_end = case_when(
    # Missing end date and a non-NA group maximum: use the group maximum.
    is.na(dt_drug_end) & !is.na(max_dt_drug_end) ~ max_dt_drug_end,
    # Group maximum is NA: fall back to the start date.
    is.na(max_dt_drug_end) ~ dt_drug_start,
    # Otherwise keep the recorded end date.
    TRUE ~ dt_drug_end
))

Answer 1

我認為您的代碼在技術上還可以，但問題是：當某個組內的 dt_drug_end 全部為 NA 時，max(dt_drug_end, na.rm = T) 會返回 -Inf（而不是 NA）。is.na(-Inf) 為 FALSE，所以您的 is.na 檢查不會生效。

# Impute missing dt_drug_end values.
#   1. Prefer the latest known end date within the same patient and
#      treatment line.
#   2. If that group has no known end date at all, use the row's own
#      dt_drug_start.
# The helper column impute_date is kept in the result.
df %>%
  # Group by patient as well as line number so that end dates never leak
  # between patients who share a line number.
  group_by(patient_id, line_number_raw) %>%
  mutate(
    # max(x, na.rm = TRUE) on an all-NA group warns and returns -Inf, which
    # is.na() does not detect; return a proper missing Date instead.
    impute_date = if (all(is.na(dt_drug_end))) {
      as.Date(NA)
    } else {
      max(dt_drug_end, na.rm = TRUE)
    }
  ) %>%
  ungroup() %>%
  mutate(
    # No usable group maximum: fall back to the start date.
    impute_date = coalesce(impute_date, dt_drug_start),
    # Only fill rows whose end date is actually missing.
    dt_drug_end = coalesce(dt_drug_end, impute_date)
  )

Answer 2

使用dplyr::rowwise()的替代方法。

library(dplyr)

# Impute missing end dates into a new column, dt_drug_end_X.
#   - Line-number groups with several rows use the group's latest known end
#     date.
#   - Single-row groups use the row's own dt_drug_start.
# NOTE(review): grouping is by line_number_raw only, which is enough for the
# single-patient sample; with several patients, group by patient_id as well.
df %>%
  group_by(line_number_raw) %>%
  mutate(
    several = n() > 1,
    # max(x, na.rm = TRUE) on an all-NA group warns and returns -Inf;
    # use a missing Date instead so it can be detected downstream.
    gMax = if (all(is.na(dt_drug_end))) {
      as.Date(NA)
    } else {
      max(dt_drug_end, na.rm = TRUE)
    }
  ) %>%
  rowwise() %>%
  mutate(dt_drug_end_X = case_when(
    # Multi-row group: take the group maximum, or the start date when the
    # whole group has no known end date.
    is.na(dt_drug_end) & several ~ coalesce(gMax, dt_drug_start),
    is.na(dt_drug_end) & !several ~ dt_drug_start,
    TRUE ~ dt_drug_end
  )) %>%
  select(starts_with(c("dt", "line")))

# # A tibble: 6 x 4
# # Rowwise: 
#   dt_drug_start dt_drug_end dt_drug_end_X line_number_raw
#   <date>        <date>      <date>        <chr>          
# 1 2013-04-12    2013-04-18  2013-04-18    1              
# 2 2013-04-12    2013-06-16  2013-06-16    1              
# 3 2016-12-06    NA          2018-05-30    3              
# 4 2018-06-14    NA          2018-06-14    4              
# 5 2013-04-12    2013-06-16  2013-06-16    1              
# 6 2015-06-02    2018-05-30  2018-05-30    3

如果值為 NA，則估算變量 dplyr

問題描述

2 個解決方案

解決方案1
0 已采納 2022-01-28 20:23:06

解決方案2
0 2022-01-28 21:23:50

如果值為 NA，則估算變量 dplyr

問題描述

2 個解決方案

解決方案1 0 已采納 2022-01-28 20:23:06

解決方案2 0 2022-01-28 21:23:50

解決方案1
0 已采納 2022-01-28 20:23:06

解決方案2
0 2022-01-28 21:23:50