简体   繁体   中英

Combine data frame columns based on condition?

I have a data.frame where first row is a Score. Scores can be identical for 2+ columns. How can I elegantly combine them into one, with a sum of select rows for those columns? I did it with 3 for loops but it is terribly inefficient. Thank you in advance!

df = structure(list(`1542917` = c(21.03, 357, 140, 0, 0.15, 0.06, 
0), `1542954` = c(21.07, 353, 7, 0, 0.15, 0.06, 0), `1542904` = c(21.19, 
358, 5, 0, 0.15, 0.06, 0), `1542908` = c(21.19, 358, 6, 0, 0.15, 
0.06, 0), `1542894` = c(21.37, 358, 2, 0, 0.15, 0.06, 0), `1542895` = c(21.37, 
358, 5, 0, 0.15, 0.06, 0), `1542901` = c(21.37, 358, 77, 0, 0.15, 
0.06, 1)), .Names = c("1542917", "1542954", "1542904", "1542908", 
"1542894", "1542895", "1542901"), row.names = c("Score", "item_count", 
"market_count", "3M Post-It Notes 1 ct./pk. ", "7Up Soft Drinks 12 oz. 12 ct./pk. 3/$10.00", 
"7Up Soft Drinks 12 oz. 12 ct./pk. 3/$11.00", "Charlottesville, VA"
), class = "data.frame")

What I see:

    row.names   1542917 1542954 1542904 1542908 1542894 1542895 1542901
1   Score   21.03   21.07   21.19   21.19   21.37   21.37   21.37
2   item_count  357.00  353.00  358.00  358.00  358.00  358.00  358.00
3   market_count    140.00  7.00    5.00    6.00    2.00    5.00    77.00
4   3M Post-It Notes 1 ct./pk.  0.00    0.00    0.00    0.00    0.00    0.00    0.00
5   7Up Soft Drinks 12 oz. 12 ct./pk. 3/$10.00  0.15    0.15    0.15    0.15    0.15    0.15    0.15
6   7Up Soft Drinks 12 oz. 12 ct./pk. 3/$11.00  0.06    0.06    0.06    0.06    0.06    0.06    0.06
7   Charlottesville, VA 0.00    0.00    0.00    0.00    0.00    0.00    1.00

What I am after (row 3 summed, all others are as in first instance based on identical Score. It is guaranteed that columns with same Score have same numbers except market_count that I want to sum):

    row.names   1542917 1542954 1542904 1542894
1   Score   21.03   21.07   21.19   21.37
2   item_count  357.00  353.00  358.00  358.00
3   market_count    140.00  7.00    11.00   84.00
4   3M Post-It Notes 1 ct./pk.  0.00    0.00    0.00    0.00
5   7Up Soft Drinks 12 oz. 12 ct./pk. 3/$10.00  0.15    0.15    0.15    0.15
6   7Up Soft Drinks 12 oz. 12 ct./pk. 3/$11.00  0.06    0.06    0.06    0.06
7   Charlottesville, VA 0.00    0.00    0.00    1.00

EDIT - my clumsy solution. Problem is that I have 10k+ rows and it is very slow and inelegant anyways

Score = c(63.69, 27.31, 31.99, 25.41, 26.61, 28.35, 83.91, 22.59, 26.61, 
          21.73, 27.11, 26.99, 21.55, 26.99, 22.01, 21.93, 21.99, 24.39, 
          24.39, 25.31, 22.05, 21.55, 22.01, 22.33, 21.37, 21.37, 21.67, 
          26.13, 22.55, 27.11, 21.99, 21.37, 21.81, 20.71, 21.19, 21.87, 
          22.59, 29.61, 21.19, 27.21, 38.91, 28.81, 65.89, 28.71, 22.99, 
          39.85, 21.63, 21.03, 39.85, 29.41, 38.89, 34.87, 26.83, 30.85, 
          22.05, 28.05, 46.75, 27.31, 21.39, 21.73, 26.79, 21.55, 21.39, 
          29.17, 23.19, 21.07, 23.19, 21.73, 26.07, 22.01, 22.39, 46.47, 
          25.41, 21.39, 27.11, 21.55, 26.79, 21.87, 21.73, 21.55, 22.03, 
          22.35, 26.79, 27.31, 27.49, 27.11, 27.75, 26.13, NA)
un_score = unique(sort(Score))
print(un_score)
sum_mark = matrix(0,ncol=length(un_score),nrow=nrow(df))

for (i in 1:length(un_score)) {
  for (j in 1:ncol(df)) {
    for (k in 1:nrow(df)) {
      if (df[1,j] == un_score[i]) {
        if (k<3 | (k > 3 & k <= length(prod)+3)) sum_mark[k,i] = unique(df[k,j])
        else sum_mark[k,i] = sum_mark[k,i] + df[k,j]
      }
    }
  }
}
View(sum_mark)

Consider length(prod) = 3 in this example

using base R:

df1=setNames(df,df[1,])
df2=transform(stack(df1),i=rownames(df)[c(row(df1))],ind=sub("([.]\\d+).*","\\1",ind))
df3=aggregate(values~.,df2,function(x)sum(unique(x)))
reshape(df3,timevar = "ind",idvar = "i",dir="wide")
                                          i values.21.03 values.21.07 values.21.19 values.21.37
1                 3M Post-It Notes 1 ct./pk.          0.00         0.00         0.00         0.00
5  7Up Soft Drinks 12 oz. 12 ct./pk. 3/$10.00         0.15         0.15         0.15         0.15
9  7Up Soft Drinks 12 oz. 12 ct./pk. 3/$11.00         0.06         0.06         0.06         0.06
13                        Charlottesville, VA         0.00         0.00         0.00         1.00
17                                 item_count       357.00       353.00       358.00       358.00
21                               market_count       140.00         7.00        11.00        84.00
25                                      Score        21.03        21.07        21.19        21.37

or you can do:

xtabs(values~i+ind,df3)

                                            ind
i                                             21.03  21.07  21.19  21.37
  3M Post-It Notes 1 ct./pk.                   0.00   0.00   0.00   0.00
  7Up Soft Drinks 12 oz. 12 ct./pk. 3/$10.00   0.15   0.15   0.15   0.15
  7Up Soft Drinks 12 oz. 12 ct./pk. 3/$11.00   0.06   0.06   0.06   0.06
  Charlottesville, VA                          0.00   0.00   0.00   1.00
  item_count                                 357.00 353.00 358.00 358.00
  market_count                               140.00   7.00  11.00  84.00
  Score                                       21.03  21.07  21.19  21.37

if you need the above as a data.frame:

as.data.frame.matrix(xtabs(values~i+ind,df3))
                                            21.03  21.07  21.19  21.37
3M Post-It Notes 1 ct./pk.                   0.00   0.00   0.00   0.00
7Up Soft Drinks 12 oz. 12 ct./pk. 3/$10.00   0.15   0.15   0.15   0.15
7Up Soft Drinks 12 oz. 12 ct./pk. 3/$11.00   0.06   0.06   0.06   0.06
Charlottesville, VA                          0.00   0.00   0.00   1.00
item_count                                 357.00 353.00 358.00 358.00
market_count                               140.00   7.00  11.00  84.00
Score                                       21.03  21.07  21.19  21.37

Using data.table:

library(data.table)
dcast(setDT(melt(df))[,ind:=as.character(value[1]),by=variable][,
     c("i","variable"):=.(c(row(df)),NULL)][,sum(unique(value)),by=.(i,ind)],i~ind,value.var="V1")
No id variables; using all as measure variables
   i  21.03  21.07  21.19  21.37
1: 1  21.03  21.07  21.19  21.37
2: 2 357.00 353.00 358.00 358.00
3: 3 140.00   7.00  11.00  84.00
4: 4   0.00   0.00   0.00   0.00
5: 5   0.15   0.15   0.15   0.15
6: 6   0.06   0.06   0.06   0.06
7: 7   0.00   0.00   0.00   1.00

Please see, there is combination of transpostion, tapply and merge . I removed the third row (with market counts) to remove non-unique rows in a dataframe.

df2 <- as.data.frame(t(df))
x <- factor(df2$Score)
y <- tapply(df2$market_count, x, sum) 
ns <- names(y)
df3 <- data.frame(Score = ns, market_count_sum = y)
df4 <- unique(merge(df2[, -3], df3))
as.data.frame(t(df4))

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM