按 R 中的查找表中的值拆分列

Question

我有一張表格，每個 hpo_term 佔一行，因此同一位病人（同一個 ID）可能有很多行。

ID hpo_term
123 kidney failure
123 hand tremor
123 kidney transplant
432 hypertension
432 exotropia
432 scissor gait

我還有另外兩張表格，一張是腎臟相關術語，另一張是非腎臟相關術語。腎臟術語表看起來像這樣：

kidney failure
kidney transplant
hypertension

非腎臟的看起來像這樣：

hand tremor
exotropia
scissor gait

我想要的結果是這樣的表格：

ID kidney_hpo_term                   non_kidney_hpo_term
123 kidney failure;kidney transplant hand tremor
432 hypertension                     exotropia;scissor gait

實際上有數百名患者和數百個 HPO 術語。

我可以使用基礎 R 和 dplyr，但我真的不知道如何解決這個問題。

您的幫助將不勝感激。

非常感謝

編輯：

真正的 table1 還有其他與術語無關的額外欄位，這些欄位對同一個 ID 的值總是相同，我也想把它們一併保留在結果中。例如：

 ID hpo_term              year_of_birth  affected_relative   genome
    123 kidney failure    2000               Y                38
    123 hand tremor       2000               Y                38
    123 kidney transplant 2000               Y                38
    432 hypertension      1980               N                37
    432 exotropia         1980               N                37
    432 scissor gait      1980               N                37

Answer 1

這是一個 dplyr 解決方案：

library(dplyr)
library(tidyr) # pivot_wider() lives in tidyr, which dplyr does not attach

# Example input: one row per (ID, hpo_term) pair.
table1 <- data.frame(
  ID = c(123, 123, 123, 432, 432, 432),
  hpo_term = c(
    "kidney failure", "hand tremor", "kidney transplant",
    "hypertension", "exotropia", "scissor gait"
  )
)

# Lookup vectors used to classify each HPO term.
kid_terms <- c("kidney failure", "kidney transplant", "hypertension")
nonkid_terms <- c("hand tremor", "exotropia", "scissor gait")

# Tag every row with the name of the output column it belongs to; terms
# found in neither lookup vector stay NA.
table1$term_type <- NA_character_
table1$term_type[table1$hpo_term %in% kid_terms] <- "kidney_hpo_term"
table1$term_type[table1$hpo_term %in% nonkid_terms] <- "non_kidney_hpo_term"

# Collapse the terms of each (ID, term_type) pair into one ";"-separated
# string, then give each term type its own column. `.groups = "drop"`
# returns an ungrouped tibble so later verbs are not silently per-ID.
table2 <- table1 %>%
  group_by(ID, term_type) %>%
  summarize(term_list = paste(hpo_term, collapse = ";"), .groups = "drop") %>%
  pivot_wider(names_from = term_type, values_from = term_list)

> table2
    ID kidney_hpo_term                  non_kidney_hpo_term   
1   123 kidney failure;kidney transplant hand tremor           
2   432 hypertension                     exotropia;scissor gait

這是data.table解決方案：

library(data.table)

# Lookup vectors used to classify each HPO term.
kid_terms <- c("kidney failure", "kidney transplant", "hypertension")
nonkid_terms <- c("hand tremor", "exotropia", "scissor gait")

# Example input: one row per (ID, hpo_term) pair.
table1 <- data.table(
  ID = rep(c(123, 432), each = 3),
  hpo_term = c(
    "kidney failure", "hand tremor", "kidney transplant",
    "hypertension", "exotropia", "scissor gait"
  )
)

# Tag each row with the name of the output column it belongs to; rows
# whose term is in neither lookup vector keep NA.
table1[, term_type := NA_character_]
table1[hpo_term %in% kid_terms, term_type := "kidney_hpo_term"]
table1[hpo_term %in% nonkid_terms, term_type := "non_kidney_hpo_term"]

# One row per (ID, term_type) with the terms pasted into a single string.
table2 <- table1[
  ,
  .(term_list = paste(hpo_term, collapse = ";")),
  by = .(ID, term_type)
]

# Long to wide: one row per ID, one column per term type.
table3 <- dcast(table2, ID ~ term_type, value.var = "term_list")

> table3
    ID                  kidney_hpo_term    non_kidney_hpo_term
1: 123 kidney failure;kidney transplant            hand tremor
2: 432                     hypertension exotropia;scissor gait

Answer 2

library(dplyr)
library(tidyr)

# Attach the term type to every patient row. No `by` is given, so the join
# uses the columns the two tables have in common and reports them.
patients_typed <- left_join(patients, terms)

# One row per (ID, type) with that group's terms pasted into one string.
terms_by_type <- summarize(
  group_by(patients_typed, ID, type),
  ID.hpo_term = paste(ID.hpo_term, collapse = ", "),
  .groups = "drop"
)

# Long to wide: one column per term type.
tidyr::pivot_wider(
  terms_by_type,
  names_from = type,
  values_from = ID.hpo_term
)

結果

Joining, by = "ID.hpo_term"
# A tibble: 2 x 3
     ID kidney_hpo_term                   non_kidney_hpo_term    
  <dbl> <chr>                             <chr>                  
1   123 kidney failure, kidney transplant hand tremor            
2   432 hypertension                      exotropia, scissor gait

輸入數據

# Example patient table: one row per (ID, term) pair.
patients <- data.frame(
  ID = rep(c(123, 432), each = 3),
  ID.hpo_term = c(
    "kidney failure", "hand tremor", "kidney transplant",
    "hypertension", "exotropia", "scissor gait"
  ),
  stringsAsFactors = FALSE
)


# Lookup table mapping each HPO term to the output column it belongs to.
# The first three terms are kidney terms, the last three are not, matching
# the `each = 3` repetition of the two type labels.
terms <- data.frame(
  stringsAsFactors = FALSE,
  type = rep(c("kidney_hpo_term", "non_kidney_hpo_term"), each = 3),
  ID.hpo_term = c(
    "kidney failure", "kidney transplant", "hypertension",
    "hand tremor", "exotropia", "scissor gait"
  )
)

Answer 3

這是使用 tidyr::pivot_wider 的另一種方法：直接透過 values_fn 在轉寬的同時完成匯總，而不必另外用一個步驟先行匯總：

library(dplyr)
library(tidyr)

# Label each term as "Kidney" or "Non.kidney", then widen on that label,
# pasting together the terms that land in the same ID/label cell.
# The output columns take their names from the labels rather than from a
# positional rename afterwards, so they stay correct whichever kind of term
# appears first and when pt.data carries additional per-ID columns.
pt.data %>%
  mutate(
    kidney = if_else(hpo_term %in% kidney.hpo, "Kidney", "Non.kidney")
  ) %>%
  pivot_wider(
    names_from = kidney,
    values_from = hpo_term,
    names_sort = TRUE, # fixed column order: Kidney, then Non.kidney
    values_fn = function(x) paste(x, collapse = ";"),
    values_fill = NA
  )
## A tibble: 2 x 3
#     ID Kidney                           Non.kidney            
#  <int> <chr>                            <chr>                 
#1   123 kidney failure;kidney transplant hand tremor           
#2   432 hypertension                     exotropia;scissor gait

數據：

# Example patient table: integer IDs, one row per (ID, hpo_term) pair.
pt.data <- data.frame(
  ID = rep(c(123L, 432L), each = 3),
  hpo_term = c(
    "kidney failure", "hand tremor", "kidney transplant",
    "hypertension", "exotropia", "scissor gait"
  ),
  stringsAsFactors = FALSE
)

# HPO terms that count as kidney-related.
kidney.hpo <- c("kidney failure", "kidney transplant", "hypertension")

按 R 中的查找表中的值拆分列

問題描述

3 個解決方案

解決方案1
1 2021-05-06 15:07:48

解決方案2
1 2021-05-06 15:13:44

解決方案3
1 已采納 2021-05-06 15:14:02

按 R 中的查找表中的值拆分列

問題描述

3 個解決方案

解決方案1 1 2021-05-06 15:07:48

解決方案2 1 2021-05-06 15:13:44

解決方案3 1 已采納 2021-05-06 15:14:02

解決方案1
1 2021-05-06 15:07:48

解決方案2
1 2021-05-06 15:13:44

解決方案3
1 已采納 2021-05-06 15:14:02