簡體   English   中英

多列組的ifelse格式R

[英]ifelse formatting for groups of multiple columns R

我的df包含一系列具有相似名稱的列,每三列進行分組,類似於:

>df<-data.frame(c(0,1,4,5),c(0,1,3,3),c(0,1,1,1),c(0,1,1,1),c(0,1,1,1),c(0,1,1,1),c(0,8,1,9),c(6,1,1,1),c(5,1,3,4))

 >names(df)<-c("AA1","AA2","AA3","BB1","BB2","BB3","CC1","CC2","CC3")

> df

AA1 AA2 AA3 BB1 BB2 BB3 CC1 CC2 CC3

1 0 0 0 0 0 0 0 3 3

2 1 1 1 1 1 1 8 1 1

3 4 6 1 1 1 1 1 1 3

4 5 5 1 1 1 1 9 1 4

這基本上顯示了4位患者每種檢查類型(AA,BB,CC)的3種不同測量值(1,2,3)。 實際上,我有一個龐大的數據集,其中包含2,000名患者的10種不同檢查,每種檢查各有3種測量值。 我想添加一個新的疾病分類列:如果任一檢查中至少有一項測量值(XX1,XX2,XX3,其中XX = AA或BB或CC)的得分> 4,則患者患有該疾病。 因此,新數據集將如下所示:

>

AA1 AA2 AA3 BB1 BB2 BB3 CC1 CC2 CC3 疾病

1 0 0 0 0 0 0 0 3 3 0

2 1 1 1 1 1 1 8 1 1 1

3 4 6 1 1 1 1 1 1 3 1

4 5 5 1 1 1 1 9 1 4 1

# Example data: 4 patients (rows) x 3 exams (AA, BB, CC) x 3 measurements.
df <- data.frame(
  AA1 = c(0, 1, 4, 5), AA2 = c(0, 1, 3, 3), AA3 = c(0, 1, 1, 1),
  BB1 = c(0, 1, 1, 1), BB2 = c(0, 1, 1, 1), BB3 = c(0, 1, 1, 1),
  CC1 = c(0, 8, 1, 9), CC2 = c(3, 1, 1, 1), CC3 = c(3, 1, 3, 4)
)

# Exam prefix of each column: the column name with its digits removed.
indx <- gsub("\\d+", "", colnames(df))
# Column positions grouped by exam prefix (a named list: AA, BB, CC).
lst <- split(seq_len(ncol(df)), indx)

# Subset the data by exam. df[i] (list-style indexing) always returns a data
# frame, whereas df[, i] collapses to a plain vector when an exam has only one
# measurement column, which would make the do.call() calls below fail.
lapply(lst, function(i) df[i])

# Row-wise maximum within each exam, tested against the threshold `>= 4`.
lapply(lst, function(i) do.call(pmax, df[i]) >= 4)

# Number of exams per patient whose maximum reaches the threshold. vapply()
# states the expected result shape instead of relying on sapply()'s
# simplification.
rowSums(
  vapply(lst, function(i) do.call(pmax, df[i]) >= 4, logical(nrow(df)))
)
#[1] 0 1 1 2

# TRUE when at least one exam is positive.
rowSums(
  vapply(lst, function(i) do.call(pmax, df[i]) >= 4, logical(nrow(df)))
) > 0
#[1] FALSE  TRUE  TRUE  TRUE

# Adding 0 coerces the logical result to numeric 0/1.
(rowSums(
  vapply(lst, function(i) do.call(pmax, df[i]) >= 4, logical(nrow(df)))
) > 0) + 0
#[1] 0 1 1 1

將以上代碼合併為一條語句:

# Flag a patient as diseased (1) when, for at least one exam, the row-wise
# maximum of that exam's measurements is >= 4; otherwise 0.
# local() keeps the intermediate objects out of the calling environment.
df$DISEASE <- local({
  # Column positions grouped by exam prefix (column name without digits).
  exam_cols <- split(seq_len(ncol(df)), gsub("\\d+", "", colnames(df)))
  # One logical column per exam. df[i] is always a data frame, so do.call()
  # also works for an exam with a single measurement column; cbind() always
  # returns a matrix, so rowSums() also works for a one-row data frame.
  exam_hit <- do.call(
    cbind,
    lapply(exam_cols, function(i) do.call(pmax, df[i]) >= 4)
  )
  # Adding 0 coerces the logical result to numeric 0/1.
  (rowSums(exam_hit) > 0) + 0
})


df
#   AA1 AA2 AA3 BB1 BB2 BB3 CC1 CC2 CC3 DISEASE
# 1   0   0   0   0   0   0   0   3   3       0
# 2   1   1   1   1   1   1   8   1   1       1
# 3   4   3   1   1   1   1   1   1   3       1
# 4   5   3   1   1   1   1   9   1   4       1

或者:

# Insert `_` between the exam prefix and the measurement number (AA1 -> AA_1)
# so that reshape() can split the column names on it.
colnames(df) <- gsub("([A-Za-z]+)(\\d+)", "\\1_\\2", colnames(df))
df$id <- seq_len(nrow(df)) # patient id, one per row

# Long format: one row per (id, measurement number) and one column per exam
# (AA, BB, CC). Only the upper-case measurement columns vary; `id` does not.
df1 <- reshape(df, idvar = "id", varying = grep("[A-Z]", colnames(df)),
               direction = "long", sep = "_")

# Per patient: is any value of any exam >= 4? The rows are split on the id
# column of the reshaped data itself; splitting on the shorter df$id would
# only work through silent recycling and a particular row order. The `id` and
# `time` columns are dropped by name rather than by position.
df$DISEASE <- as.numeric(vapply(
  split(df1[, setdiff(names(df1), c("id", "time")), drop = FALSE], df1$id),
  function(x) any(x >= 4),
  logical(1)
))
df[, setdiff(names(df), "id")] # show the result without the helper id column
#   AA_1 AA_2 AA_3 BB_1 BB_2 BB_3 CC_1 CC_2 CC_3 DISEASE
# 1    0    0    0    0    0    0    0    3    3       0
# 2    1    1    1    1    1    1    8    1    1       1
# 3    4    3    1    1    1    1    1    1    3       1
# 4    5    3    1    1    1    1    9    1    4       1
# Example data: one row per patient; the columns are measurements 1-3 of the
# exams AA, BB and CC.
df <- data.frame(
  AA1 = c(0, 1, 4, 5), AA2 = c(0, 1, 3, 3), AA3 = c(0, 1, 1, 1),
  BB1 = c(0, 1, 1, 1), BB2 = c(0, 1, 1, 1), BB3 = c(0, 1, 1, 1),
  CC1 = c(0, 8, 1, 9), CC2 = c(6, 1, 1, 1), CC3 = c(5, 1, 3, 4)
)

數據格式的解決方案:

# Flag each row (patient) that has at least one value above 4: `df > 4` is a
# logical matrix and rowSums() counts its TRUE entries per row.
rowSums(df > 4) > 0
#[1]  TRUE  TRUE FALSE  TRUE

這利用了一個事實,即邏輯值在計算它們的和時會被強制為0和1。

但是通常最好使用整潔的數據:

df$id <- rownames(df) # patient identifier taken from the row names
library(reshape2)
# Long ("tidy") format: one row per patient and measurement column.
# `id.vars` is spelled out in full instead of relying on partial matching.
DF <- melt(df, id.vars = "id")
# Split the original column name into exam prefix and measurement number.
# The `+` quantifier belongs outside the bracket expression; inside the
# brackets it is just a literal plus sign added to the character set.
DF$exam <- gsub("[[:digit:]]+", "", DF$variable)
DF$meas <- as.numeric(gsub("[[:alpha:]]+", "", DF$variable))

head(DF)
#  id variable value exam meas
#1  1      AA1     0   AA    1
#2  2      AA1     1   AA    1
#3  3      AA1     4   AA    1
#4  4      AA1     5   AA    1
#5  1      AA2     0   AA    2
#6  2      AA2     1   AA    2


# Is the patient diseased? One result row per `id`: TRUE when any of that
# patient's values is above 4.
library(plyr)
ddply(DF, .(id), summarize, disease = any(value > 4))
#  id disease
#1  1    TRUE
#2  2    TRUE
#3  3   FALSE
#4  4    TRUE

# Which exam was positive? Same test, evaluated separately for every
# combination of `id` and `exam`.
ddply(DF, .(id, exam), summarize, disease = any(value > 4))
#   id exam disease
#1   1   AA   FALSE
#2   1   BB   FALSE
#3   1   CC    TRUE
#4   2   AA   FALSE
#5   2   BB   FALSE
#6   2   CC    TRUE
#7   3   AA   FALSE
#8   3   BB   FALSE
#9   3   CC   FALSE
#10  4   AA    TRUE
#11  4   BB   FALSE
#12  4   CC    TRUE

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM