简体   繁体   English

R-汇总两列

[英]R - aggregate two columns

I have a data frame which looks like this 我有一个看起来像这样的数据框

 id1    id2    attr   
 ------------------
 11              a     
 11              a    
         11      a   
         11      b   
         11      c   
 22              a   
 22              a
         22      a
         22      a
 33              d
 44              e

I want it to look like this. 我希望它看起来像这样。 The id1, id2 are counts (frequency). id1,id2是计数(频率)。

id1    id2    attr   
 ------------------
 2              a     
        1       a  
        1       b
        1       c
 2              a
        2       a
 1              d
 1              e

The gaps have no values in them, so if required I can fill them with NA. 空白处没有值,因此如果需要,我可以用NA填充它们。 I tried using the aggregate function but could not get the desired output. 我尝试使用aggregate函数,但无法获得所需的输出。 Thanks for the help. 感谢您的帮助。

This is your data 这是你的数据

# Example input: every row carries its id in exactly one of id1 / id2 (the
# other one is NA), together with a categorical attribute.
dat <- data.frame(
  id1 = c(11L, 11L, NA, NA, NA, 22L, 22L, NA, NA, 33L, 44L),
  id2 = c(NA, NA, 11L, 11L, 11L, NA, NA, 22L, 22L, NA, NA),
  attr = factor(
    c("a", "a", "a", "b", "c", "a", "a", "a", "a", "d", "e"),
    levels = c("a", "b", "c", "d", "e")
  )
)

The required output isn't typical, but this seems to work using 'plyr' 所需的输出不是典型的,但这似乎可以使用'plyr'

library(plyr)

# Build the frequency table from `dat` (columns id1, id2, attr): one output row
# per distinct input row, with the non-missing id replaced by the row's count.

# Count how often each (id1, id2, attr) combination occurs. Grouping on all
# three columns yields exactly one frequency per distinct row. Counting `attr`
# inside (id1, id2) groups only lines up when the per-attr counts happen to
# recycle cleanly; a group such as a, a, b makes transform() fail instead.
temp <- ddply(dat, .(id1, id2, attr), transform, freq = length(attr))

# Every row of a group now carries the same freq, so keep one row per group.
temp <- unique(temp)

# One id per row: whichever of id1 / id2 is present (11, 22, 33, 44).
temp$id <- pmax(temp$id1, temp$id2, na.rm = TRUE)

# Desired order: by id, id1 rows ahead of id2 rows, then by attr. The explicit
# is.na() key avoids depending on how ties between id1 and id2 rows are broken.
temp <- temp[order(temp$id, is.na(temp$id1), temp$attr), ]

# Swap the id values for the frequency; cells that were empty stay NA.
temp$id1 <- ifelse(is.na(temp$id1), NA, temp$freq)
temp$id2 <- ifelse(is.na(temp$id2), NA, temp$freq)

# Keep only the requested columns, selected by name rather than by position.
temp <- temp[c("id1", "id2", "attr")]

temp



   id1 id2 attr
1    2  NA    a
7   NA   1    a
8   NA   1    b
9   NA   1    c
3    2  NA    a
10  NA   2    a
5    1  NA    d
6    1  NA    e

Using @jfreels' approach (tally from dplyr), applied to dat: 使用@jfreels的方法(dplyr中的tally)处理dat:

library(dplyr)

# Frequency of every distinct (id1, id2, attr) row; tally() stores it in `n`.
dat1 <- tally(group_by(dat, id1, id2, attr))

# Distinct rows of the input, kept in order of first appearance.
dat2 <- unique(dat)

# Attach the counts to the distinct rows (joined on the shared columns), put
# the count in whichever id column is filled, and drop the helper column.
dat2 %>%
  left_join(dat1) %>%
  mutate(
    id1 = ifelse(is.na(id1), NA, n),
    id2 = ifelse(is.na(id2), NA, n)
  ) %>%
  select(-n)
#Joining by: c("id1", "id2", "attr")
 #  id1 id2 attr
#1   2  NA    a
#2  NA   1    a
#3  NA   1    b
#4  NA   1    c
#5   2  NA    a
#6  NA   2    a
#7   1  NA    d
#8   1  NA    e

This method's results are not exactly formatted how you wanted, but may be simpler to understand. 此方法的结果未按照您想要的方式精确格式化,但可能更易于理解。

# load library
library(dplyr)

# Example input: each row has its id in either id1 or id2 (the other is NA)
# plus a categorical attribute.
dat <- data.frame(
  id1 = c(11L, 11L, NA, NA, NA, 22L, 22L, NA, NA, 33L, 44L),
  id2 = c(NA, NA, 11L, 11L, 11L, NA, NA, 22L, 22L, NA, NA),
  attr = factor(
    c("a", "a", "a", "b", "c", "a", "a", "a", "a", "d", "e"),
    levels = c("a", "b", "c", "d", "e")
  )
)

# Group by all three columns and count the rows in each group; the count is
# returned in a new column `n`.
tally(group_by(dat, id1, id2, attr))

# output
Source: local data frame [8 x 4]
Groups: id1, id2

  id1 id2 attr n
1  11  NA    a 2
2  22  NA    a 2
3  33  NA    d 1
4  44  NA    e 1
5  NA  11    a 1
6  NA  11    b 1
7  NA  11    c 1
8  NA  22    a 2

Excuse my poor R code, but in order to produce the output you want, I had to do some unconventional things. 请原谅我糟糕的R代码,但是为了得到您想要的输出,我不得不做一些非常规的事情。 Unfortunately, the code is not very scalable. 不幸的是,该代码的可扩展性不高。 It can certainly be improved, but it delivers the example output. 它当然可以改进,但可以得到示例输出。 The only difference is that your input (called rawdata below) is assumed to have NA in the empty cells. 唯一的区别是假定您的输入(下文称为rawdata)在空白单元格中为NA。

# Builds `final.table`: one row per distinct row of `rawdata`, in order of
# first appearance, with the filled id column replaced by that row's frequency.
# NOTE(review): assumes `rawdata` has columns id1, id2 and attr, with NA in the
# empty cells -- confirm against where `rawdata` is created.

# Distinct rows, computed once and reused below
unique.rows <- rawdata[!duplicated(rawdata), , drop = FALSE]

# Collapse each row into a single "-"-separated key. paste() works column-wise
# here, which avoids the character-matrix coercion apply() does on a data
# frame; unname() keeps column names from being read as paste() arguments.
unique.pasted <- do.call(paste, c(unname(as.list(unique.rows)), sep = "-"))
pasted.rows <- do.call(paste, c(unname(as.list(rawdata)), sep = "-"))

# Get frequencies and maintain row order. as.vector() strips the "table" class
# so that data.frame() below builds one plain column per vector; a table object
# would be expanded into separate label and Freq columns.
frequencies <- as.vector(table(pasted.rows)[unique.pasted])

# Separate id1 and id2: the count goes only into the column that was filled
id1.freq <- frequencies
id1.freq[is.na(unique.rows[["id1"]])] <- NA
id2.freq <- frequencies
id2.freq[is.na(unique.rows[["id2"]])] <- NA

# Obtain the final table
final.table <- data.frame(
  id1 = id1.freq,
  id2 = id2.freq,
  attr = unique.rows[["attr"]]
)

# Guarantee plain 1..n row names
row.names(final.table) <- NULL

# Replace NA with empty values (this turns the count columns into character)
final.table[is.na(final.table)] <- ""
final.table

 id1 id2 attr
1   2        a
2       1    a
3       1    b
4       1    c
5   2        a
6       2    a
7   1        d
8   1        e

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM