簡體   English   中英

“折疊”數據列

[英]"Collapsing" data columns

有人可以幫我整理這個混亂的數據集嗎?

以下示例數據描述了五名患者的治療及其開始/停止日期。 Line1、Line2、Line3 等描述了施用治療的順序(即第一次治療、第二次治療等)。 但是,您可以看到數據錄入的方式導致某些患者的第一次治療並不在第一列中。 例如,ID3 的第一次治療是 TreatmentD,但它被輸入到了 Line3 中。 更復雜的是,有些患者在連續的治療之間完全跳過了某些列(例如 ID4)。

# Example input: one row per patient and three character columns
# (name / start / stop) per treatment line. Treatments were entered in
# arbitrary line slots, so earlier slots are frequently NA.
# Dates are kept as plain character strings (they appear to be d/m/yy).
original_data <- data.frame(
  patient_id = c("ID1", "ID2", "ID3", "ID4", "ID5"),

  # Line 1
  Line1_name  = c("TreatmentA", NA, NA, NA, NA),
  Line1_start = c("5/5/17", NA, NA, NA, NA),
  Line1_stop  = c("18/8/17", NA, NA, NA, NA),

  # Line 2
  Line2_name  = c("TreatmentF", "TreatmentB", NA, "TreatmentB", "TreatmentF"),
  Line2_start = c("6/11/18", "6/6/18", NA, "3/9/18", "15/11/18"),
  Line2_stop  = c("19/12/18", "12/12/18", NA, "22/2/19", "15/6/19"),

  # Line 3
  Line3_name  = c("TreatmentC", NA, "TreatmentD", NA, "TreatmentC"),
  Line3_start = c("13/2/19", NA, "24/11/17", NA, "29/6/19"),
  Line3_stop  = c("2/4/19", NA, "3/4/18", NA, "15/9/19"),

  # Line 4
  Line4_name  = c(NA, NA, NA, "TreatmentA", NA),
  Line4_start = c(NA, NA, NA, "22/2/19", NA),
  Line4_stop  = c(NA, NA, NA, "8/7/19", NA),

  # Line 5
  Line5_name  = c(NA, NA, NA, NA, "TreatmentE"),
  Line5_start = c(NA, NA, NA, NA, "15/1/20"),
  Line5_stop  = c(NA, NA, NA, NA, "20/5/20"),

  # Keep strings as character on every R version (default only since R 4.0).
  stringsAsFactors = FALSE
)
head(original_data)
#>   patient_id Line1_name Line1_start Line1_stop Line2_name Line2_start
#> 1        ID1 TreatmentA      5/5/17    18/8/17 TreatmentF     6/11/18
#> 2        ID2       <NA>        <NA>       <NA> TreatmentB      6/6/18
#> 3        ID3       <NA>        <NA>       <NA>       <NA>        <NA>
#> 4        ID4       <NA>        <NA>       <NA> TreatmentB      3/9/18
#> 5        ID5       <NA>        <NA>       <NA> TreatmentF    15/11/18
#>   Line2_stop Line3_name Line3_start Line3_stop Line4_name Line4_start
#> 1   19/12/18 TreatmentC     13/2/19     2/4/19       <NA>        <NA>
#> 2   12/12/18       <NA>        <NA>       <NA>       <NA>        <NA>
#> 3       <NA> TreatmentD    24/11/17     3/4/18       <NA>        <NA>
#> 4    22/2/19       <NA>        <NA>       <NA> TreatmentA     22/2/19
#> 5    15/6/19 TreatmentC     29/6/19    15/9/19       <NA>        <NA>
#>   Line4_stop Line5_name Line5_start Line5_stop
#> 1       <NA>       <NA>        <NA>       <NA>
#> 2       <NA>       <NA>        <NA>       <NA>
#> 3       <NA>       <NA>        <NA>       <NA>
#> 4     8/7/19       <NA>        <NA>       <NA>
#> 5       <NA> TreatmentE     15/1/20    20/5/20

問題:有沒有一種方法可以“折疊”數據,使得不跳過任何列,並且所有數據都“左移”到最靠前的空治療槽位? 我嘗試使用 dplyr::coalesce() 函數,但是雖然我可以將 Line2 合並到 Line1,卻無法將 Line3 合並到 Line2,因為原始 Line2 的內容仍然存在(抱歉,有點難以解釋)。 不過我覺得它可能就是正確的函數……

我的目標是做到這一點

# Desired result: every patient's treatments shifted left so that they occupy
# Line1, Line2, ... consecutively with no gaps.
final_data <- data.frame(
  patient_id = c("ID1", "ID2", "ID3", "ID4", "ID5"),

  # Line 1
  Line1_name = c(
    "TreatmentA", "TreatmentB", "TreatmentD", "TreatmentB", "TreatmentF"
  ),
  Line1_start = c("5/5/17", "6/6/18", "24/11/17", "3/9/18", "15/11/18"),
  Line1_stop  = c("18/8/17", "12/12/18", "3/4/18", "22/2/19", "15/6/19"),

  # Line 2
  Line2_name  = c("TreatmentF", NA, NA, "TreatmentA", "TreatmentC"),
  Line2_start = c("6/11/18", NA, NA, "22/2/19", "29/6/19"),
  Line2_stop  = c("19/12/18", NA, NA, "8/7/19", "15/9/19"),

  # Line 3
  Line3_name  = c("TreatmentC", NA, NA, NA, "TreatmentE"),
  Line3_start = c("13/2/19", NA, NA, NA, "15/1/20"),
  Line3_stop  = c("2/4/19", NA, NA, NA, "20/5/20"),

  # Lines 4 and 5 are empty after collapsing. These columns hold only bare
  # NA values, so they are logical rather than character.
  Line4_name  = rep(NA, 5),
  Line4_start = rep(NA, 5),
  Line4_stop  = rep(NA, 5),
  Line5_name  = rep(NA, 5),
  Line5_start = rep(NA, 5),
  Line5_stop  = rep(NA, 5),

  # Keep strings as character on every R version (default only since R 4.0).
  stringsAsFactors = FALSE
)
head(final_data)
#>   patient_id Line1_name Line1_start Line1_stop Line2_name Line2_start
#> 1        ID1 TreatmentA      5/5/17    18/8/17 TreatmentF     6/11/18
#> 2        ID2 TreatmentB      6/6/18   12/12/18       <NA>        <NA>
#> 3        ID3 TreatmentD    24/11/17     3/4/18       <NA>        <NA>
#> 4        ID4 TreatmentB      3/9/18    22/2/19 TreatmentA     22/2/19
#> 5        ID5 TreatmentF    15/11/18    15/6/19 TreatmentC     29/6/19
#>   Line2_stop Line3_name Line3_start Line3_stop Line4_name Line4_start
#> 1   19/12/18 TreatmentC     13/2/19     2/4/19         NA          NA
#> 2       <NA>       <NA>        <NA>       <NA>         NA          NA
#> 3       <NA>       <NA>        <NA>       <NA>         NA          NA
#> 4     8/7/19       <NA>        <NA>       <NA>         NA          NA
#> 5    15/9/19 TreatmentE     15/1/20    20/5/20         NA          NA
#>   Line4_stop Line5_name Line5_start Line5_stop
#> 1         NA         NA          NA         NA
#> 2         NA         NA          NA         NA
#> 3         NA         NA          NA         NA
#> 4         NA         NA          NA         NA
#> 5         NA         NA          NA         NA

謝謝!

有趣的問題!

library(dplyr)
library(tidyr)

# Collapse each patient's treatment lines to the left so the populated lines
# are renumbered 1, 2, 3, ... without gaps.
original_data %>%
  # Long format: one row per patient / original line / property
  # (name, start, stop). Cells holding NA are dropped.
  pivot_longer(
    starts_with("Line"),
    names_sep = "_",
    names_to = c("line", "prop"),
    values_drop_na = TRUE
  ) %>%
  group_by(patient_id) %>%
  # Renumber the remaining lines by order of first appearance within the
  # patient. pivot_longer() emits rows in the data's column order, so this
  # keeps Line2 ahead of Line10, whereas ranking through factor() sorts the
  # labels as strings and would put "Line10" before "Line2".
  mutate(line = match(line, unique(line))) %>%
  ungroup() %>%
  # Rebuild the wide column names from the new line number.
  mutate(name = sprintf("Line%s_%s", line, prop)) %>%
  pivot_wider(id_cols = patient_id, names_from = name, values_from = value)

返回:

# A tibble: 5 x 10
  patient_id Line1_name Line1_start Line1_stop Line2_name Line2_start Line2_stop
  <chr>      <chr>      <chr>       <chr>      <chr>      <chr>       <chr>
1 ID1        TreatmentA 5/5/17      18/8/17    TreatmentF 6/11/18     19/12/18
2 ID2        TreatmentB 6/6/18      12/12/18   NA         NA          NA
3 ID3        TreatmentD 24/11/17    3/4/18     NA         NA          NA
4 ID4        TreatmentB 3/9/18      22/2/19    TreatmentA 22/2/19     8/7/19
5 ID5        TreatmentF 15/11/18    15/6/19    TreatmentC 29/6/19     15/9/19
# … with 3 more variables: Line3_name <chr>, Line3_start <chr>,
#   Line3_stop <chr>

您可以將所有非 NA 值向左移動 -

# Shift every non-NA value in each row to the left and pad the tail with NA.
# `original_data[] <-` overwrites the cell contents in place while keeping the
# data frame's column names.
# Caveat: values are shifted one by one, not per name/start/stop triplet, so a
# line with only some of its three fields filled in would misalign the columns
# that follow it. apply() also passes each row as a single atomic vector.
original_data[] <- t(apply(
  original_data,
  1,
  # Indexing past the end of the NA-free vector yields NA, which restores the
  # original row length.
  function(x) na.omit(x)[seq_along(x)]
))

#  patient_id Line1_name Line1_start Line1_stop Line2_name Line2_start Line2_stop
#1        ID1 TreatmentA      5/5/17    18/8/17 TreatmentF     6/11/18   19/12/18
#2        ID2 TreatmentB      6/6/18   12/12/18       <NA>        <NA>       <NA>
#3        ID3 TreatmentD    24/11/17     3/4/18       <NA>        <NA>       <NA>
#4        ID4 TreatmentB      3/9/18    22/2/19 TreatmentA     22/2/19     8/7/19
#5        ID5 TreatmentF    15/11/18    15/6/19 TreatmentC     29/6/19    15/9/19

#  Line3_name Line3_start Line3_stop Line4_name Line4_start Line4_stop Line5_name
#1 TreatmentC     13/2/19     2/4/19       <NA>        <NA>       <NA>       <NA>
#2       <NA>        <NA>       <NA>       <NA>        <NA>       <NA>       <NA>
#3       <NA>        <NA>       <NA>       <NA>        <NA>       <NA>       <NA>
#4       <NA>        <NA>       <NA>       <NA>        <NA>       <NA>       <NA>
#5 TreatmentE     15/1/20    20/5/20       <NA>        <NA>       <NA>       <NA>

#  Line5_start Line5_stop
#1        <NA>       <NA>
#2        <NA>       <NA>
#3        <NA>       <NA>
#4        <NA>       <NA>
#5        <NA>       <NA>

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM