簡體   English   中英

使用 mutate 拆分多列 - dplyr

[英]Splitting multiple columns using mutate - dplyr

我正在嘗試拆分多個列,但很難有效地做到這一點。

我有一個df:

> dput(df)
structure(list(UNIQUE_PATIENT_ID = c("DIS-1101-1001-E1", "DIS-1101-1002-E1", 
"DIS-1101-1003-E1", "DIS-1101-1004-E1", "DIS-1101-1004-E2", "DIS-1101-1005-E1", 
"DIS-1101-1006-E1", "DIS-1101-1007-E1", "DIS-1101-1008-E1", "DIS-1101-1009-E1", 
"DIS-1101-1010-E1", "DIS-1101-1011-E1", "DIS-1101-1012-E1", "DIS-1101-1013-E1", 
"DIS-1101-1014-E1", "DIS-1101-1015-E1", "DIS-1101-1016-E1", "DIS-1101-1017-E1", 
"DIS-1101-1018-E1", "DIS-1101-1019-E1", "DIS-1101-1020-E1", "DIS-1101-1021-E1", 
"DIS-1101-1022-E1", "DIS-1101-1023-E1", "DIS-1101-1025-E1", "DIS-1101-1026-E1", 
"DIS-1101-1027-E1", "DIS-1101-1028-E1", "DIS-1101-1029-E1", "DIS-1101-1030-E1", 
"DIS-1101-1031-E1", "DIS-1101-1032-E1", "DIS-1101-1033-E1", "DIS-1101-1034-E1", 
"DIS-1101-1035-E1", "DIS-1101-1036-E1", "DIS-1101-1037-E1", "DIS-1101-1038-E1", 
"DIS-1101-1039-E1", "DIS-1101-1040-E1", "DIS-1101-1041-E1", "DIS-1101-1042-E1", 
"DIS-1101-1042-E2", "DIS-1101-1043-E1", "DIS-1101-1044-E1", "DIS-1101-1045-E1", 
"DIS-1101-1046-E1", "DIS-1101-1047-E1", "DIS-1101-1048-E1", "DIS-1101-1049-E1"
), `TIME_POINT_1=HAEMOGLOBIN` = c("BASELINE=123.00;FIRST=117.00", 
"FIRST=92.00", "FIRST=111.00", "BASELINE=125.00;FIRST=113.00", 
"FIRST=124.00", "NULL=NULL", "NULL=NULL", "FIRST=77.00", "FIRST=132.00", 
"FIRST=123.00", "BASELINE=119.00;FIRST=93.00", "BASELINE=163.00;FIRST=141.00", 
"FIRST=137.00", "NULL=NULL", "NULL=NULL", "FIRST=137.00", "FIRST=127.00", 
"BASELINE=NULL;FIRST=101.00;SECOND=105.00", "FIRST=120.00", "FIRST=145.00", 
"FIRST=126.00", "BASELINE=93.00;FIRST=82.00", "BASELINE=114.00;FIRST=95.00", 
"BASELINE=137.00;FIRST=125.00", "BASELINE=157.00;FIRST=112.00", 
"NULL=NULL", "FIRST=127.00", "FIRST=147.00", "BASELINE=80.00;FIRST=92.00", 
"BASELINE=129.00;FIRST=131.00", "FIRST=108.00", "BASELINE=92.00;FIRST=80.00", 
"BASELINE=118.00;FIRST=107.00", "NULL=NULL", "NULL=NULL", "BASELINE=NULL;FIRST=89.00", 
"NULL=NULL", "BASELINE=NULL;FIRST=101.00", "FIRST=151.00", "FIRST=106.00", 
"BASELINE=81.00;FIRST=87.00", "FIRST=94.00;SECOND=85.00", "FIRST=85.00", 
"FIRST=118.00;SECOND=116.00", "FIRST=101.00;SECOND=103.00", "FIRST=114.00", 
"BASELINE=NULL;FIRST=129.00", "BASELINE=123.00;FIRST=105.00", 
"NULL=NULL", "NULL=NULL"), `TIME_POINT_2=WHITE_CELLS` = c("BASELINE=2.90;FIRST=2.40", 
"FIRST=14.80", "FIRST=11.00", "BASELINE=6.90;FIRST=7.30", "FIRST=4.50", 
"NULL=NULL", "NULL=NULL", "FIRST=6.10", "FIRST=7.70", "FIRST=16.20", 
"BASELINE=7.40;FIRST=5.30", "BASELINE=12.80;FIRST=10.30", "FIRST=14.50", 
"NULL=NULL", "NULL=NULL", "FIRST=12.80", "FIRST=3.70", "BASELINE=4.20;FIRST=4.70;SECOND=6.50", 
"FIRST=16.60", "FIRST=9.30", "FIRST=15.90", "BASELINE=31.00;FIRST=31.70", 
"BASELINE=3.70;FIRST=7.00", "BASELINE=14.10;FIRST=7.20", "BASELINE=21.80;FIRST=31.20", 
"NULL=NULL", "FIRST=9.20", "FIRST=5.90", "BASELINE=2.20;FIRST=0.50", 
"BASELINE=10.50;FIRST=10.60", "FIRST=12.00", "BASELINE=14.70;FIRST=11.80", 
"BASELINE=13.40;FIRST=13.00", "NULL=NULL", "NULL=NULL", "BASELINE=NULL;FIRST=18.60", 
"NULL=NULL", "BASELINE=4.30;FIRST=5.90", "FIRST=7.60", "FIRST=3.10", 
"BASELINE=10.10;FIRST=11.40", "FIRST=12.00;SECOND=21.30", "FIRST=9.80", 
"FIRST=29.50;SECOND=4.80", "FIRST=4.60;SECOND=9.90", "FIRST=18.90", 
"BASELINE=15.00;FIRST=13.00", "BASELINE=12.80;FIRST=10.60", "NULL=NULL", 
"NULL=NULL")), row.names = c(NA, -50L), class = c("tbl_df", "tbl", 
"data.frame"))

我想要這個輸出:

dput(output)
structure(list(UNIQUE_PATIENT_ID = c("DIS-1101-1001-E1", "DIS-1101-1002-E1", 
"DIS-1101-1003-E1", "DIS-1101-1004-E1", "DIS-1101-1004-E2", "DIS-1101-1005-E1", 
"DIS-1101-1006-E1", "DIS-1101-1007-E1", "DIS-1101-1008-E1", "DIS-1101-1009-E1", 
"DIS-1101-1010-E1", "DIS-1101-1011-E1", "DIS-1101-1012-E1", "DIS-1101-1013-E1", 
"DIS-1101-1014-E1", "DIS-1101-1015-E1", "DIS-1101-1016-E1", "DIS-1101-1017-E1", 
"DIS-1101-1018-E1", "DIS-1101-1019-E1", "DIS-1101-1020-E1", "DIS-1101-1021-E1", 
"DIS-1101-1022-E1", "DIS-1101-1023-E1", "DIS-1101-1025-E1", "DIS-1101-1026-E1", 
"DIS-1101-1027-E1", "DIS-1101-1028-E1", "DIS-1101-1029-E1", "DIS-1101-1030-E1", 
"DIS-1101-1031-E1", "DIS-1101-1032-E1", "DIS-1101-1033-E1", "DIS-1101-1034-E1", 
"DIS-1101-1035-E1", "DIS-1101-1036-E1", "DIS-1101-1037-E1", "DIS-1101-1038-E1", 
"DIS-1101-1039-E1", "DIS-1101-1040-E1", "DIS-1101-1041-E1", "DIS-1101-1042-E1", 
"DIS-1101-1042-E2", "DIS-1101-1043-E1", "DIS-1101-1044-E1", "DIS-1101-1045-E1", 
"DIS-1101-1046-E1", "DIS-1101-1047-E1", "DIS-1101-1048-E1", "DIS-1101-1049-E1"
), Haemoglobin_BASELINE = c("123.00", NA, NA, "125.00", NA, NA, 
NA, NA, NA, NA, "119.00", "163.00", NA, NA, NA, NA, NA, "NULL", 
NA, NA, NA, "93.00", "114.00", "137.00", "157.00", NA, NA, NA, 
"80.00", "129.00", NA, "92.00", "118.00", NA, NA, "NULL", NA, 
"NULL", NA, NA, "81.00", NA, NA, NA, NA, NA, "NULL", "123.00", 
NA, NA), Haemoglobin_FIRST = c("117.00", "92.00", "111.00", "113.00", 
"124.00", NA, NA, "77.00", "132.00", "123.00", "93.00", "141.00", 
"137.00", NA, NA, "137.00", "127.00", "101.00", "120.00", "145.00", 
"126.00", "82.00", "95.00", "125.00", "112.00", NA, "127.00", 
"147.00", "92.00", "131.00", "108.00", "80.00", "107.00", NA, 
NA, "89.00", NA, "101.00", "151.00", "106.00", "87.00", "94.00", 
"85.00", "118.00", "101.00", "114.00", "129.00", "105.00", NA, 
NA), Haemoglobin_SECOND = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, "105.00", NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, "85.00", NA, "116.00", "103.00", NA, NA, NA, NA, NA), 
    WBC_BASELINE = c("2.90", NA, NA, "6.90", NA, NA, NA, NA, 
    NA, NA, "7.40", "12.80", NA, NA, NA, NA, NA, "4.20", NA, 
    NA, NA, "31.00", "3.70", "14.10", "21.80", NA, NA, NA, "2.20", 
    "10.50", NA, "14.70", "13.40", NA, NA, "NULL", NA, "4.30", 
    NA, NA, "10.10", NA, NA, NA, NA, NA, "15.00", "12.80", NA, 
    NA), WBC_FIRST = c("2.40", "14.80", "11.00", "7.30", "4.50", 
    NA, NA, "6.10", "7.70", "16.20", "5.30", "10.30", "14.50", 
    NA, NA, "12.80", "3.70", "4.70", "16.60", "9.30", "15.90", 
    "31.70", "7.00", "7.20", "31.20", NA, "9.20", "5.90", "0.50", 
    "10.60", "12.00", "11.80", "13.00", NA, NA, "18.60", NA, 
    "5.90", "7.60", "3.10", "11.40", "12.00", "9.80", "29.50", 
    "4.60", "18.90", "13.00", "10.60", NA, NA), WBC_SECOND = c(NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, "6.50", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "21.30", NA, 
    "4.80", "9.90", NA, NA, NA, NA, NA)), row.names = c(NA, -50L
), class = "data.frame")

我可以做到這一點,但我的代碼效率低下(因為我在每個血柱上都這樣做):

#haemoglobin
haem_ID <- df %>% select(UNIQUE_PATIENT_ID, `TIME_POINT_1=HAEMOGLOBIN`)

haem_sep <- haem_ID %>%
  rename(text = `TIME_POINT_1=HAEMOGLOBIN`) %>%
  separate_rows(text, sep = ';') %>%
  separate(text, c('text', 'value'), sep = '=') %>%
  pivot_wider(names_from = text, values_from = value)

tmp <- haem_sep %>% select(-UNIQUE_PATIENT_ID)
colnames(tmp) <- paste("Haemoglobin", colnames(tmp), sep = "_")
tmp2 <- haem_ID %>% select(UNIQUE_PATIENT_ID)
cbind(tmp2, tmp) -> tmp3
tmp3 %<>% select(-Haemoglobin_NULL)
haem <- tmp3


#white cells
a <- df %>% select(UNIQUE_PATIENT_ID, `TIME_POINT_2=WHITE_CELLS`)

b <- a %>%
  rename(text = `TIME_POINT_2=WHITE_CELLS`) %>%
  separate_rows(text, sep = ';') %>%
  separate(text, c('text', 'value'), sep = '=') %>%
  pivot_wider(names_from = text, values_from = value)

tmp <- b %>% select(-UNIQUE_PATIENT_ID)
colnames(tmp) <- paste("WBC", colnames(tmp), sep = "_")
tmp2 <- a %>% select(UNIQUE_PATIENT_ID)
cbind(tmp2, tmp) -> tmp3
tmp3 %<>% select(-WBC_NULL)
WBC <- tmp3

output <- merge(haem, WBC)

請問有人可以幫我實現同樣但更有效的嗎? 實際上,我有 20 次血液測試,而不僅僅是 2 次,所以硬編碼並不理想。

更新

根據友好的響應,我嘗試運行建議的代碼,但出現錯誤,我不知道為什么。

輸入數據

structure(list(UNIQUE_PATIENT_ID = c("DIS-1104-1021-E1", "DIS-1107-1010-E1", 
"DIS-1124-1108-E1"), `TIME_POINT_1=HAEMOGLOBIN` = c("BASELINE=98.00;FIRST=90.00;SECOND=81.00;FIRST=85.00", 
"FIRST=120.00;NULL=120.00;NULL=118.00;SECOND=109.00;NULL=186.00", 
"BASELINE=78.00;FIRST=NULL;FIRST=99.00"), `TIME_POINT_2=WHITE_CELLS` = c("BASELINE=0.01;FIRST=NULL;SECOND=0.01;FIRST=0.00", 
"FIRST=15.40;NULL=16.10;NULL=14.50;SECOND=11.40;NULL=2.20;NULL=5.00", 
"BASELINE=85.00;FIRST=NULL;FIRST=209.00"), `TIME_POINT_3=PLATELETS` = c("BASELINE=28.00;FIRST=28.00;SECOND=12.00;FIRST=21.00", 
"FIRST=200.00;NULL=186.00;NULL=203.00;SECOND=243.00;NULL=145.00;NULL=457.00", 
"BASELINE=186.00;FIRST=NULL"), `TIME_POINT_4=NEUTROPHILS` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00", 
"FIRST=13.00;NULL=13.20;NULL=11.10;SECOND=7.70;NULL=1.30;NULL=2.20", 
"BASELINE=7.10;FIRST=2.40;FIRST=NULL"), `TIME_POINT_5=LYMPHOCYTES` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00", 
"FIRST=0.80;NULL=1.60;NULL=1.40;SECOND=1.70;NULL=0.40", "BASELINE=76.80;FIRST=135.00;FIRST=NULL"
), `TIME_POINT_6=MONOCYTES` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00", 
"FIRST=1.50;NULL=1.20;NULL=1.90;SECOND=2.00;NULL=0.40;NULL=0.90", 
"BASELINE=1.40;FIRST=4.20;FIRST=NULL"), `TIME_POINT_7=EOSINOPHILS` = c("BASELINE=0.00;FIRST=0.00;SECOND=0.00", 
"FIRST=0.10;NULL=0.00;SECOND=0.00;NULL=0.10;NULL=0.30", "BASELINE=0.10;FIRST=0.40;FIRST=NULL"
), `TIME_POINT_8=FIBRINOGEN` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=6.90;NULL=6.60;NULL=7.10;SECOND=6.60;NULL=6.00;NULL=NULL", 
"BASELINE=6.70;FIRST=NULL"), `TIME_POINT_9=PT` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=19.20;NULL=17.20;NULL=16.10;SECOND=14.40;NULL=14.90;NULL=NULL", 
"BASELINE=13.40;FIRST=NULL"), `TIME_POINT_10=UREA` = c("BASELINE=4.30;FIRST=4.30;SECOND=5.10;FIRST=4.50", 
"FIRST=3.80;NULL=3.70;NULL=4.40;SECOND=2.80;NULL=2.90;NULL=4.10", 
"BASELINE=5.10;FIRST=NULL;FIRST=7.20"), `TIME_POINT_11=CREATININE` = c("BASELINE=67.00;FIRST=55.00;SECOND=61.00;FIRST=60.00", 
"FIRST=44.00;NULL=42.00;NULL=45.00;SECOND=42.00;NULL=33.00", 
"BASELINE=65.00;FIRST=NULL;FIRST=72.00"), `TIME_POINT_12=ALT` = c("BASELINE=9.00;FIRST=11.00;SECOND=8.00;FIRST=5.00", 
"FIRST=20.00;NULL=NULL;NULL=19.00;SECOND=16.00;NULL=16.00;NULL=21.00", 
"BASELINE=64.00;FIRST=NULL;FIRST=92.00"), `TIME_POINT_13=BILIRUBIN` = c("BASELINE=3.00;FIRST=5.00;SECOND=3.00;FIRST=NULL", 
"FIRST=20.00;NULL=NULL;NULL=11.00;SECOND=10.00;NULL=5.00", "BASELINE=7.00;FIRST=NULL;FIRST=10.00"
), `TIME_POINT_14=ALBUMIN` = c("BASELINE=42.00;FIRST=39.00;SECOND=35.00;FIRST=36.00", 
"FIRST=42.00;NULL=NULL;NULL=37.00;SECOND=35.00;NULL=35.00;NULL=39.00", 
"BASELINE=31.00;FIRST=NULL;FIRST=40.00"), `TIME_POINT_15=VITAMIN_D` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_16=CRP` = c("BASELINE=NULL;FIRST=172.00;SECOND=116.00;FIRST=212.00", 
"FIRST=427.00;NULL=420.00;NULL=338.00;SECOND=215.00;NULL=87.00;NULL=64.00", 
"BASELINE=199.00;FIRST=NULL;FIRST=1.00"), `TIME_POINT_17=PCT` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_18=FERRITIN` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=NULL;NULL=378.00;SECOND=NULL", "BASELINE=809.00;FIRST=432.00;FIRST=NULL"
), `TIME_POINT_19=TROP_T` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=NULL;NULL=3.00;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_20=D_DIMER` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=1.85;NULL=1.65;SECOND=NULL;NULL=NULL", "BASELINE=749.00;FIRST=486.00;FIRST=NULL"
), `TIME_POINT_21=INR` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_22=BNP_NT` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_23=LDH` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=254.00;FIRST=NULL"
), `TIME_POINT_24=CK` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=NULL;NULL=29.00;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_25=SCD_25` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_26=BASE_EXCESS` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=3.10;NULL=1.50;NULL=NULL;SECOND=NULL", "BASELINE=-3.80;FIRST=NULL"
), `TIME_POINT_27=LACTATE` = c("BASELINE=NULL;FIRST=NULL;SECOND=NULL", 
"FIRST=0.90;NULL=1.30;NULL=NULL;SECOND=NULL", "BASELINE=0.60;FIRST=NULL"
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"
))

代碼:

output <- input |> 
  separate_rows(-UNIQUE_PATIENT_ID, sep = ";") |> 
  pivot_longer(-UNIQUE_PATIENT_ID) |> 
  separate(value, into = c("timepoint", "value"), sep = "=") |> 
  mutate(name = str_remove(name, ".*=")) |> 
  pivot_wider(names_from = c("name", "timepoint")) |> 
  select(-ends_with("NULL"))

錯誤信息:

Error in `fn()`:
! In row 2, can't recycle input of size 5 to size 6.
Run `rlang::last_error()` to see where the error occurred.

謝謝!

最后一個select行是可選的,具體取決於您希望列的排序方式以及您是否希望保留 NULL 列:

df |> 
  separate_rows(-UNIQUE_PATIENT_ID, sep = ";") |> 
  pivot_longer(-UNIQUE_PATIENT_ID) |> 
  separate(value, into = c("timepoint", "value"), sep = "=") |> 
  mutate(name = str_remove(name, ".*=")) |> 
  pivot_wider(names_from = c("name", "timepoint")) |> 
  select(UNIQUE_PATIENT_ID, sort(tidyselect::peek_vars()), -contains("NULL"))

使用帶有語法列名稱的示例數據,即使用“。” 而不是“=”),可以在干凈的 R 會話中運行以下表示:

library(tidyverse)

df <- tribble(
  ~UNIQUE_PATIENT_ID,                  ~TIME_POINT_1.HAEMOGLOBIN,              ~TIME_POINT_2.WHITE_CELLS,
  "DIS-1101-1001-E1",             "BASELINE=123.00;FIRST=117.00",             "BASELINE=2.90;FIRST=2.40",
  "DIS-1101-1002-E1",                              "FIRST=92.00",                          "FIRST=14.80",
  "DIS-1101-1003-E1",                             "FIRST=111.00",                          "FIRST=11.00",
  "DIS-1101-1004-E1",             "BASELINE=125.00;FIRST=113.00",             "BASELINE=6.90;FIRST=7.30",
  "DIS-1101-1004-E2",                             "FIRST=124.00",                           "FIRST=4.50",
  "DIS-1101-1005-E1",                                "NULL=NULL",                            "NULL=NULL",
  "DIS-1101-1006-E1",                                "NULL=NULL",                            "NULL=NULL",
  "DIS-1101-1007-E1",                              "FIRST=77.00",                           "FIRST=6.10",
  "DIS-1101-1008-E1",                             "FIRST=132.00",                           "FIRST=7.70",
  "DIS-1101-1009-E1",                             "FIRST=123.00",                          "FIRST=16.20",
  "DIS-1101-1010-E1",              "BASELINE=119.00;FIRST=93.00",             "BASELINE=7.40;FIRST=5.30",
  "DIS-1101-1011-E1",             "BASELINE=163.00;FIRST=141.00",           "BASELINE=12.80;FIRST=10.30",
  "DIS-1101-1012-E1",                             "FIRST=137.00",                          "FIRST=14.50",
  "DIS-1101-1013-E1",                                "NULL=NULL",                            "NULL=NULL",
  "DIS-1101-1014-E1",                                "NULL=NULL",                            "NULL=NULL",
  "DIS-1101-1015-E1",                             "FIRST=137.00",                          "FIRST=12.80",
  "DIS-1101-1016-E1",                             "FIRST=127.00",                           "FIRST=3.70",
  "DIS-1101-1017-E1", "BASELINE=NULL;FIRST=101.00;SECOND=105.00", "BASELINE=4.20;FIRST=4.70;SECOND=6.50",
  "DIS-1101-1018-E1",                             "FIRST=120.00",                          "FIRST=16.60",
  "DIS-1101-1019-E1",                             "FIRST=145.00",                           "FIRST=9.30",
  "DIS-1101-1020-E1",                             "FIRST=126.00",                          "FIRST=15.90",
  "DIS-1101-1021-E1",               "BASELINE=93.00;FIRST=82.00",           "BASELINE=31.00;FIRST=31.70",
  "DIS-1101-1022-E1",              "BASELINE=114.00;FIRST=95.00",             "BASELINE=3.70;FIRST=7.00",
  "DIS-1101-1023-E1",             "BASELINE=137.00;FIRST=125.00",            "BASELINE=14.10;FIRST=7.20",
  "DIS-1101-1025-E1",             "BASELINE=157.00;FIRST=112.00",           "BASELINE=21.80;FIRST=31.20",
  "DIS-1101-1026-E1",                                "NULL=NULL",                            "NULL=NULL",
  "DIS-1101-1027-E1",                             "FIRST=127.00",                           "FIRST=9.20",
  "DIS-1101-1028-E1",                             "FIRST=147.00",                           "FIRST=5.90",
  "DIS-1101-1029-E1",               "BASELINE=80.00;FIRST=92.00",             "BASELINE=2.20;FIRST=0.50",
  "DIS-1101-1030-E1",             "BASELINE=129.00;FIRST=131.00",           "BASELINE=10.50;FIRST=10.60",
  "DIS-1101-1031-E1",                             "FIRST=108.00",                          "FIRST=12.00",
  "DIS-1101-1032-E1",               "BASELINE=92.00;FIRST=80.00",           "BASELINE=14.70;FIRST=11.80",
  "DIS-1101-1033-E1",             "BASELINE=118.00;FIRST=107.00",           "BASELINE=13.40;FIRST=13.00",
  "DIS-1101-1034-E1",                                "NULL=NULL",                            "NULL=NULL",
  "DIS-1101-1035-E1",                                "NULL=NULL",                            "NULL=NULL",
  "DIS-1101-1036-E1",                "BASELINE=NULL;FIRST=89.00",            "BASELINE=NULL;FIRST=18.60",
  "DIS-1101-1037-E1",                                "NULL=NULL",                            "NULL=NULL",
  "DIS-1101-1038-E1",               "BASELINE=NULL;FIRST=101.00",             "BASELINE=4.30;FIRST=5.90",
  "DIS-1101-1039-E1",                             "FIRST=151.00",                           "FIRST=7.60",
  "DIS-1101-1040-E1",                             "FIRST=106.00",                           "FIRST=3.10",
  "DIS-1101-1041-E1",               "BASELINE=81.00;FIRST=87.00",           "BASELINE=10.10;FIRST=11.40",
  "DIS-1101-1042-E1",                 "FIRST=94.00;SECOND=85.00",             "FIRST=12.00;SECOND=21.30",
  "DIS-1101-1042-E2",                              "FIRST=85.00",                           "FIRST=9.80",
  "DIS-1101-1043-E1",               "FIRST=118.00;SECOND=116.00",              "FIRST=29.50;SECOND=4.80",
  "DIS-1101-1044-E1",               "FIRST=101.00;SECOND=103.00",               "FIRST=4.60;SECOND=9.90",
  "DIS-1101-1045-E1",                             "FIRST=114.00",                          "FIRST=18.90",
  "DIS-1101-1046-E1",               "BASELINE=NULL;FIRST=129.00",           "BASELINE=15.00;FIRST=13.00",
  "DIS-1101-1047-E1",             "BASELINE=123.00;FIRST=105.00",           "BASELINE=12.80;FIRST=10.60",
  "DIS-1101-1048-E1",                                "NULL=NULL",                            "NULL=NULL",
  "DIS-1101-1049-E1",                                "NULL=NULL",                            "NULL=NULL"
  )

df |> 
  separate_rows(-UNIQUE_PATIENT_ID, sep = ";") |> 
  pivot_longer(-UNIQUE_PATIENT_ID) |> 
  separate(value, into = c("timepoint", "value"), sep = "=") |> 
  mutate(name = str_remove(name, ".*\\.")) |> 
  pivot_wider(names_from = c("name", "timepoint")) |> 
  select(UNIQUE_PATIENT_ID, sort(tidyselect::peek_vars()), -contains("NULL"))

#> # A tibble: 50 × 7
#>    UNIQUE_PATIENT_ID HAEMOGLOBIN_BASELINE HAEMOGLOBIN_FIRST HAEMOGLOBIN_SECOND
#>    <chr>             <chr>                <chr>             <chr>             
#>  1 DIS-1101-1001-E1  123.00               117.00            <NA>              
#>  2 DIS-1101-1002-E1  <NA>                 92.00             <NA>              
#>  3 DIS-1101-1003-E1  <NA>                 111.00            <NA>              
#>  4 DIS-1101-1004-E1  125.00               113.00            <NA>              
#>  5 DIS-1101-1004-E2  <NA>                 124.00            <NA>              
#>  6 DIS-1101-1005-E1  <NA>                 <NA>              <NA>              
#>  7 DIS-1101-1006-E1  <NA>                 <NA>              <NA>              
#>  8 DIS-1101-1007-E1  <NA>                 77.00             <NA>              
#>  9 DIS-1101-1008-E1  <NA>                 132.00            <NA>              
#> 10 DIS-1101-1009-E1  <NA>                 123.00            <NA>              
#> # … with 40 more rows, and 3 more variables: WHITE_CELLS_BASELINE <chr>,
#> #   WHITE_CELLS_FIRST <chr>, WHITE_CELLS_SECOND <chr>

reprex 包創建於 2022-07-04 (v2.0.1)

** 更新您更廣泛的 SAMPLE2 數據集 **

為了處理跨列的不同數量的分隔符,我建議使用一個函數來逐列迭代地separate_rows行。 這克服了在樣本數據集中不是問題的更廣泛數據集中的separate_rows行錯誤。

使用模式匹配,我還只提取了引用“BASELINE”、“FIRST”或“SECOND”的字符串,這些字符串與一個值相關聯。

正如@harre 所指出的,由於某些列包含多個,例如 FIRST,我隨后在使用pivot_wider之前使用slice_head獲取了其中的第一個。 您可以改為使用slice_tail取最后一個或取平均值等,具體取決於要求。

您需要在擴展數據集上進一步測試,但希望這會有所幫助!

library(tidyverse)

df <- structure(list(UNIQUE_PATIENT_ID = c(
  "DIS-1104-1021-E1", "DIS-1107-1010-E1",
  "DIS-1124-1108-E1"
), `TIME_POINT_1=HAEMOGLOBIN` = c(
  "BASELINE=98.00;FIRST=90.00;SECOND=81.00;FIRST=85.00",
  "FIRST=120.00;NULL=120.00;NULL=118.00;SECOND=109.00;NULL=186.00",
  "BASELINE=78.00;FIRST=NULL;FIRST=99.00"
), `TIME_POINT_2=WHITE_CELLS` = c(
  "BASELINE=0.01;FIRST=NULL;SECOND=0.01;FIRST=0.00",
  "FIRST=15.40;NULL=16.10;NULL=14.50;SECOND=11.40;NULL=2.20;NULL=5.00",
  "BASELINE=85.00;FIRST=NULL;FIRST=209.00"
), `TIME_POINT_3=PLATELETS` = c(
  "BASELINE=28.00;FIRST=28.00;SECOND=12.00;FIRST=21.00",
  "FIRST=200.00;NULL=186.00;NULL=203.00;SECOND=243.00;NULL=145.00;NULL=457.00",
  "BASELINE=186.00;FIRST=NULL"
), `TIME_POINT_4=NEUTROPHILS` = c(
  "BASELINE=0.00;FIRST=0.00;SECOND=0.00",
  "FIRST=13.00;NULL=13.20;NULL=11.10;SECOND=7.70;NULL=1.30;NULL=2.20",
  "BASELINE=7.10;FIRST=2.40;FIRST=NULL"
), `TIME_POINT_5=LYMPHOCYTES` = c(
  "BASELINE=0.00;FIRST=0.00;SECOND=0.00",
  "FIRST=0.80;NULL=1.60;NULL=1.40;SECOND=1.70;NULL=0.40", "BASELINE=76.80;FIRST=135.00;FIRST=NULL"
), `TIME_POINT_6=MONOCYTES` = c(
  "BASELINE=0.00;FIRST=0.00;SECOND=0.00",
  "FIRST=1.50;NULL=1.20;NULL=1.90;SECOND=2.00;NULL=0.40;NULL=0.90",
  "BASELINE=1.40;FIRST=4.20;FIRST=NULL"
), `TIME_POINT_7=EOSINOPHILS` = c(
  "BASELINE=0.00;FIRST=0.00;SECOND=0.00",
  "FIRST=0.10;NULL=0.00;SECOND=0.00;NULL=0.10;NULL=0.30", "BASELINE=0.10;FIRST=0.40;FIRST=NULL"
), `TIME_POINT_8=FIBRINOGEN` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=6.90;NULL=6.60;NULL=7.10;SECOND=6.60;NULL=6.00;NULL=NULL",
  "BASELINE=6.70;FIRST=NULL"
), `TIME_POINT_9=PT` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=19.20;NULL=17.20;NULL=16.10;SECOND=14.40;NULL=14.90;NULL=NULL",
  "BASELINE=13.40;FIRST=NULL"
), `TIME_POINT_10=UREA` = c(
  "BASELINE=4.30;FIRST=4.30;SECOND=5.10;FIRST=4.50",
  "FIRST=3.80;NULL=3.70;NULL=4.40;SECOND=2.80;NULL=2.90;NULL=4.10",
  "BASELINE=5.10;FIRST=NULL;FIRST=7.20"
), `TIME_POINT_11=CREATININE` = c(
  "BASELINE=67.00;FIRST=55.00;SECOND=61.00;FIRST=60.00",
  "FIRST=44.00;NULL=42.00;NULL=45.00;SECOND=42.00;NULL=33.00",
  "BASELINE=65.00;FIRST=NULL;FIRST=72.00"
), `TIME_POINT_12=ALT` = c(
  "BASELINE=9.00;FIRST=11.00;SECOND=8.00;FIRST=5.00",
  "FIRST=20.00;NULL=NULL;NULL=19.00;SECOND=16.00;NULL=16.00;NULL=21.00",
  "BASELINE=64.00;FIRST=NULL;FIRST=92.00"
), `TIME_POINT_13=BILIRUBIN` = c(
  "BASELINE=3.00;FIRST=5.00;SECOND=3.00;FIRST=NULL",
  "FIRST=20.00;NULL=NULL;NULL=11.00;SECOND=10.00;NULL=5.00", "BASELINE=7.00;FIRST=NULL;FIRST=10.00"
), `TIME_POINT_14=ALBUMIN` = c(
  "BASELINE=42.00;FIRST=39.00;SECOND=35.00;FIRST=36.00",
  "FIRST=42.00;NULL=NULL;NULL=37.00;SECOND=35.00;NULL=35.00;NULL=39.00",
  "BASELINE=31.00;FIRST=NULL;FIRST=40.00"
), `TIME_POINT_15=VITAMIN_D` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_16=CRP` = c(
  "BASELINE=NULL;FIRST=172.00;SECOND=116.00;FIRST=212.00",
  "FIRST=427.00;NULL=420.00;NULL=338.00;SECOND=215.00;NULL=87.00;NULL=64.00",
  "BASELINE=199.00;FIRST=NULL;FIRST=1.00"
), `TIME_POINT_17=PCT` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_18=FERRITIN` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=NULL;NULL=378.00;SECOND=NULL", "BASELINE=809.00;FIRST=432.00;FIRST=NULL"
), `TIME_POINT_19=TROP_T` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=NULL;NULL=3.00;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_20=D_DIMER` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=1.85;NULL=1.65;SECOND=NULL;NULL=NULL", "BASELINE=749.00;FIRST=486.00;FIRST=NULL"
), `TIME_POINT_21=INR` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_22=BNP_NT` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_23=LDH` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=254.00;FIRST=NULL"
), `TIME_POINT_24=CK` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=NULL;NULL=29.00;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_25=SCD_25` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=NULL;NULL=NULL;SECOND=NULL", "BASELINE=NULL;FIRST=NULL"
), `TIME_POINT_26=BASE_EXCESS` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=3.10;NULL=1.50;NULL=NULL;SECOND=NULL", "BASELINE=-3.80;FIRST=NULL"
), `TIME_POINT_27=LACTATE` = c(
  "BASELINE=NULL;FIRST=NULL;SECOND=NULL",
  "FIRST=0.90;NULL=1.30;NULL=NULL;SECOND=NULL", "BASELINE=0.60;FIRST=NULL"
)), row.names = c(NA, -3L), class = c("tbl_df", "tbl", "data.frame"))



pattern <- c("FIRST=\\d{2,3}.\\d{0,2}|SECOND=\\d{2,3}.\\d{0,2}|BASELINE=\\d{2,3}.\\d{0,2}")

# Function to iteratively separate by column
separate_iteratively <- function(col){
  separate_rows(df2, col, sep = ";") |> 
    select(1, col)
}

# Extract only if BASELINE, FIRST or SECOND with an associate number
df2 <- df |> 
  mutate(across(-UNIQUE_PATIENT_ID, ~ str_extract_all(., pattern) |> 
                  map_chr(~str_c(., collapse = ";"))))

# Separate & rearrange
df3 <- map_dfr(2:ncol(df2), separate_iteratively) |> 
  pivot_longer(-UNIQUE_PATIENT_ID) |> 
  filter(!is.na(value) & !value == "") |> 
  separate(value, into = c("timepoint", "value"), sep = "=") |> 
  mutate(name = str_remove(name, ".*=")) |> 
  arrange(UNIQUE_PATIENT_ID, name, timepoint) |> 
  group_by(UNIQUE_PATIENT_ID, name, timepoint) |> 
  slice_head(n = 1) |> # Or use slice_tail
  pivot_wider(names_from = c("name", "timepoint"))|> 
  select(UNIQUE_PATIENT_ID, sort(tidyselect::peek_vars()))

df3
#> # A tibble: 3 × 35
#> # Groups:   UNIQUE_PATIENT_ID [3]
#>   UNIQUE_PATIENT_ID ALBUMIN_BASELINE ALBUMIN_FIRST ALBUMIN_SECOND ALT_BASELINE
#>   <chr>             <chr>            <chr>         <chr>          <chr>       
#> 1 DIS-1104-1021-E1  42.00            39.00         35.00          <NA>        
#> 2 DIS-1107-1010-E1  <NA>             42.00         35.00          <NA>        
#> 3 DIS-1124-1108-E1  31.00            40.00         <NA>           64.00       
#> # … with 30 more variables: ALT_FIRST <chr>, ALT_SECOND <chr>,
#> #   BILIRUBIN_FIRST <chr>, BILIRUBIN_SECOND <chr>, CREATININE_BASELINE <chr>,
#> #   CREATININE_FIRST <chr>, CREATININE_SECOND <chr>, CRP_BASELINE <chr>,
#> #   CRP_FIRST <chr>, CRP_SECOND <chr>, D_DIMER_BASELINE <chr>,
#> #   D_DIMER_FIRST <chr>, FERRITIN_BASELINE <chr>, FERRITIN_FIRST <chr>,
#> #   HAEMOGLOBIN_BASELINE <chr>, HAEMOGLOBIN_FIRST <chr>,
#> #   HAEMOGLOBIN_SECOND <chr>, LDH_FIRST <chr>, LYMPHOCYTES_BASELINE <chr>, …


# Just to illustrate the issue with varying separators
tribble(~v1, ~v2,
        "a;a", "b;b",
        "a;a", "b;b;b",
        ) |> 
  separate_rows(everything(), sep = ";")
#> Error in `fn()`:
#> ! In row 2, can't recycle input of size 2 to size 3.

# Works
tribble(~v1, ~v2,
        "a;a", "b;b",
        "a;a", "b;b",
) |> 
  separate_rows(everything(), sep = ";")
#> # A tibble: 4 × 2
#>   v1    v2   
#>   <chr> <chr>
#> 1 a     b    
#> 2 a     b    
#> 3 a     b    
#> 4 a     b

reprex 包創建於 2022-07-04 (v2.0.1)

基於我上面的評論和@Carl 的實現,您可以使用values_fn直接利用嵌套列表結構,這可能會使您免於編寫一些代碼行。 我正在使用dplyr中的first()來獲取第一個元素,但您也可以根據自己的喜好使用其他函數, ~ mean(as.numeric(.))來取平均值等。沒有變化的數量當我在運行數據時,我們在使用separate_rows之前使用pivot_longer時的分隔符。

df |> 
    pivot_longer(-UNIQUE_PATIENT_ID) |> 
    separate_rows(-UNIQUE_PATIENT_ID, sep = ";") |> 
    separate(value, into = c("timepoint", "value"), sep = "=") |> 
    mutate(name = str_remove(name, ".*=")) |>
    filter(!is.na(value) & value != "NULL" & timepoint != "NULL") |>
    pivot_wider(names_from = c("name", "timepoint"), values_fn = ~ first(.)) |>
    select(UNIQUE_PATIENT_ID, sort(tidyselect::peek_vars()))

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM