简体   繁体   中英

R - aggregate two columns

I have a data frame which looks like this

 id1    id2    attr   
 ------------------
 11              a     
 11              a    
         11      a   
         11      b   
         11      c   
 22              a   
 22              a
         22      a
         22      a
 33              d
 44              e

I want it to look like this, where id1 and id2 now hold counts (frequencies) instead of the original ids.

id1    id2    attr   
 ------------------
 2              a     
        1       a  
        1       b
        1       c
 2              a
        2       a
 1              d
 1              e

The gaps contain no values, so I can fill them with NA if required. I tried using the aggregate function but could not get the desired output. Thanks for the help.

This is your data

# Example data: each row carries an id in exactly one of id1/id2 (the other
# is NA) together with a factor attribute.
dat <- data.frame(
  id1 = c(11L, 11L, NA, NA, NA, 22L, 22L, NA, NA, 33L, 44L),
  id2 = c(NA, NA, 11L, 11L, 11L, NA, NA, 22L, 22L, NA, NA),
  attr = factor(
    c("a", "a", "a", "b", "c", "a", "a", "a", "a", "d", "e"),
    levels = c("a", "b", "c", "d", "e")
  )
)

The required output isn't typical, but this seems to work using 'plyr'

library(plyr)

#count how often each (id1, id2, attr) combination occurs. Counting per
#combination (rather than per id and then recycling the counts over the rows
#of that id) stays correct when one id has attr values with different
#frequencies, and it already yields one row per combination, so no separate
#de-duplication step is needed. plyr:: is spelled out because dplyr also
#exports a function named count.
temp <- plyr::count(dat, vars = c("id1", "id2", "attr"))

#need to create an id column for whether there is 11,22,33,44 in either id1 or id2
temp$id <- pmax(temp$id1, temp$id2, na.rm = TRUE)

#order the rows into desired order; when id and attr tie, the row whose id
#came from id1 is listed before the one whose id came from id2
temp <- temp[order(temp$id, temp$attr, is.na(temp$id1)), ]

#replace id1 and id2 with the count, keeping NA where the id was missing
temp$id1 <- ifelse(is.na(temp$id1), NA, temp$freq)
temp$id2 <- ifelse(is.na(temp$id2), NA, temp$freq)

#just keep variables you want (selected by name rather than by position)
temp <- temp[c("id1", "id2", "attr")]

#drop the row names left over from the reordering
rownames(temp) <- NULL

temp



  id1 id2 attr
1   2  NA    a
2  NA   1    a
3  NA   1    b
4  NA   1    c
5   2  NA    a
6  NA   2    a
7   1  NA    d
8   1  NA    e

Using @jfreels' idea of tally() from dplyr, with dat as defined above:

library(dplyr)

# Frequency of every (id1, id2, attr) combination
dat1 <- tally(group_by(dat, id1, id2, attr))

# One row per distinct combination, in order of first appearance
dat2 <- unique(dat)

# Attach the counts to the distinct rows, overwrite whichever id column is
# populated with the count, then drop the helper column n
dat2 %>%
  left_join(dat1) %>%
  mutate(
    id1 = ifelse(is.na(id1), NA, n),
    id2 = ifelse(is.na(id2), NA, n)
  ) %>%
  select(-n)
#Joining by: c("id1", "id2", "attr")
 #  id1 id2 attr
#1   2  NA    a
#2  NA   1    a
#3  NA   1    b
#4  NA   1    c
#5   2  NA    a
#6  NA   2    a
#7   1  NA    d
#8   1  NA    e

This method's results are not exactly formatted how you wanted, but may be simpler to understand.

# load library
library(dplyr)

# your data: an id in either id1 or id2 (NA in the other) plus a factor attr
id1_values <- c(11L, 11L, NA, NA, NA, 22L, 22L, NA, NA, 33L, 44L)
id2_values <- c(NA, NA, 11L, 11L, 11L, NA, NA, 22L, 22L, NA, NA)
attr_labels <- c("a", "a", "a", "b", "c", "a", "a", "a", "a", "d", "e")
dat <- data.frame(
  id1 = id1_values,
  id2 = id2_values,
  attr = factor(attr_labels, levels = c("a", "b", "c", "d", "e"))
)

# group on all three columns, then let tally() count the rows in each group
tally(group_by(dat, id1, id2, attr))

# output
Source: local data frame [8 x 4]
Groups: id1, id2

  id1 id2 attr n
1  11  NA    a 2
2  22  NA    a 2
3  33  NA    d 1
4  44  NA    e 1
5  NA  11    a 1
6  NA  11    b 1
7  NA  11    c 1
8  NA  22    a 2

Excuse my poor R code, but to produce the output you want I had to do some unconventional things. Unfortunately the code is not very scalable. It can certainly be improved, but it delivers the example output. The only difference is that your input is assumed to have NAs in the empty cells.

#Count how often each distinct row of `rawdata` occurs and show that count in
#whichever id column (id1 or id2) is populated for the row.
#`rawdata` is the input data frame with columns id1, id2 and attr and NA in
#the empty cells; it is not created here.

#Concatenate each row to a single key. The columns are pasted directly
#instead of going through apply(), which would first coerce the whole data
#frame to a character matrix.
pasted.rows <- do.call(paste, c(unname(as.list(rawdata)), sep = "-"))

#Find the unique rows (first occurrence of each key, original order kept)
first.seen <- !duplicated(pasted.rows)
unique.rows <- rawdata[first.seen, , drop = FALSE]
unique.pasted <- pasted.rows[first.seen]

#Get frequencies and maintain row order. tabulate() returns a plain integer
#vector; a 1-d table object would be expanded by data.frame() below into
#separate label and Freq columns.
frequencies <- tabulate(match(pasted.rows, unique.pasted),
                        nbins = length(unique.pasted))

#Separate id1 and id2: the count is kept only where that id is present
id1.freq <- frequencies
id1.freq[is.na(unique.rows[["id1"]])] <- NA
id2.freq <- frequencies
id2.freq[is.na(unique.rows[["id2"]])] <- NA

#Obtain the final table
final.table <- data.frame(id1 = id1.freq, id2 = id2.freq,
                          attr = unique.rows[["attr"]])

#Remove row names
row.names(final.table) <- NULL

#Replace NA with empty values (columns that held an NA become character)
final.table[is.na(final.table)] <- ""
final.table

 id1 id2 attr
1   2        a
2       1    a
3       1    b
4       1    c
5   2        a
6       2    a
7   1        d
8   1        e

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM