简体   繁体   中英

data frame grouping and column transforming using dplyr

I have the following data frame:

# Sample input: one row per geokey / upc / week-ending date.
# Each geokey has three UPCs, and each UPC has the same two weeks,
# so the key columns are generated with rep() rather than typed out.
df <- data.frame(
  geokey = rep(c("A", "B"), each = 6),
  upc = rep(c("100", "101", "102", "200", "201", "202"), each = 2),
  endwk = rep(c("14-07-2021", "21-07-2021"), times = 6),
  Base_units = c(2, 3, 1, 2, 4, 1, 1, 4, 2, 3, 3, 2),
  Base_price = c(0.1, 0.2, 0.2, 0.1, 0.1, 0.1, 0.2, 0.3, 0.4, 0.1, 0.2, 0.3),
  Incr_units = c(2, 1, 1, 1, 2, 1, 3, 2, 2, 3, 1, 1),
  incr_price = c(0.1, 0.1, 0.1, 0.3, 0.2, 0.1, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2)
)
> df
   geokey upc      endwk Base_units Base_price Incr_units incr_price
1       A 100 14-07-2021          2        0.1          2        0.1
2       A 100 21-07-2021          3        0.2          1        0.1
3       A 101 14-07-2021          1        0.2          1        0.1
4       A 101 21-07-2021          2        0.1          1        0.3
5       A 102 14-07-2021          4        0.1          2        0.2
6       A 102 21-07-2021          1        0.1          1        0.1
7       B 200 14-07-2021          1        0.2          3        0.1
8       B 200 21-07-2021          4        0.3          2        0.2
9       B 201 14-07-2021          2        0.4          2        0.1
10      B 201 21-07-2021          3        0.1          3        0.2
11      B 202 14-07-2021          3        0.2          1        0.1
12      B 202 21-07-2021          2        0.3          1        0.2

Expected output: group by geokey and endwk, combining the upc values within each group into a single label, with all unit (volume) columns summed and all price columns averaged, as shown below:

# Hand-written illustration of the desired result: one row per geokey / week,
# with the UPCs of the group folded into a single label.
# NOTE(review): two entries do not match what the input data yields --
# Incr_units_totalled for B / 14-07-2021 (3 + 2 + 1 = 6, not 3) and
# incr_price_averaged for A / 21-07-2021 (mean of 0.1, 0.3, 0.1 = 0.1667,
# not 0.1). Values are kept exactly as posted.
df_merged <- data.frame(
  geokey = rep(c("A", "B"), each = 2),
  upc = rep(c("upc_100_101_102", "upc_200_201_202"), each = 2),
  endwk = rep(c("14-07-2021", "21-07-2021"), times = 2),
  Base_units_totalled = c(7, 6, 6, 9),
  Base_price_averaged = c(0.133, 0.133, 0.2667, 0.2333),
  Incr_units_totalled = c(5, 3, 3, 6),
  incr_price_averaged = c(0.1333, 0.1, 0.1, 0.2)
)

> df_merged
  geokey             upc      endwk Base_units_totalled Base_price_averaged Incr_units_totalled incr_price_averaged
1      A upc_100_101_102 14-07-2021                   7              0.1330                   5              0.1333
2      A upc_100_101_102 21-07-2021                   6              0.1330                   3              0.1000
3      B upc_200_201_202 14-07-2021                   6              0.2667                   3              0.1000
4      B upc_200_201_202 21-07-2021                   9              0.2333                   6              0.2000

Help will be appreciated.

I presume you want to summarize the upc column and not group by it?

library(dplyr)

# Collapse the per-UPC rows into one row per geokey / week.
# The chain has to start from `df`: group_by() takes the data as its first
# argument, so calling it with only column names fails with
# "object 'geokey' not found".
df %>%
  group_by(geokey, endwk) %>%
  summarize(
    # Fold the UPCs of the group into one label, e.g. "upc_100_101_102".
    upc = paste0("upc_", paste(upc, collapse = "_")),
    # Volume columns (names containing "units") are totalled ...
    across(contains("units"), sum),
    # ... and price columns (names containing "price") are averaged.
    across(contains("price"), mean),
    # Return an ungrouped tibble so later verbs are not silently grouped.
    .groups = "drop"
  )


# A tibble: 4 x 7
  geokey endwk      upc             Base_units Incr_units Base_price incr_price
* <chr>  <chr>      <chr>                <dbl>      <dbl>      <dbl>      <dbl>
1 A      14-07-2021 upc_100_101_102          7          5      0.133      0.133
2 A      21-07-2021 upc_100_101_102          6          3      0.133      0.167
3 B      14-07-2021 upc_200_201_202          6          6      0.267      0.1  
4 B      21-07-2021 upc_200_201_202          9          6      0.233      0.2 

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM