R中組之間的平均差異

Question

對於示例數據框：

# Example data, apparently pasted from dput() output: a 78-row tibble
# (class tbl_df) with four columns:
#   name   - character label, "a".."z" repeated
#   amount - numeric value whose group means are compared
#   decile - integer decile group, 1 to 10
#   time   - integer year, 2016 to 2018
# The trailing `spec` attribute looks like a readr column specification
# left over from import (presumably; it is not used by the analysis).
df1 <- structure(list(name = c("a", "b", "c", "d", "e", "f", "g", "h", 
"i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", 
"v", "w", "x", "y", "z", "a", "b", "c", "d", "e", "f", "g", "h", 
"i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", 
"v", "w", "x", "y", "z", "a", "b", "c", "d", "e", "f", "g", "h", 
"i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", 
"v", "w", "x", "y", "z"), amount = c(5.5, 5.4, 5.2, 5.3, 5.1, 
5.1, 5, 5, 4.9, 4.5, 6, 5.9, 5.7, 5.4, 5.3, 5.1, 5.6, 5.4, 5.3, 
5.6, 4.6, 4.2, 4.5, 4.2, 4, 3.8, 6, 5.8, 5.7, 5.6, 5.3, 5.6, 
5.4, 5.5, 5.4, 5.1, 9, 8.8, 8.6, 8.4, 8.2, 8, 7.8, 7.6, 7.4, 
7.2, 6, 5.75, 5.5, 5.25, 5, 4.75, 10, 8.9, 7.8, 6.7, 5.6, 4.5, 
3.4, 2.3, 1.2, 0.1, 6, 5.8, 5.7, 5.6, 5.5, 5.5, 5.4, 5.6, 5.8, 
5.1, 6, 5.5, 5.4, 5.3, 5.2, 5.1), decile = c(1L, 2L, 3L, 4L, 
5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
10L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 
9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 
4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 2L, 3L, 4L, 5L, 6L), time = c(2016L, 
2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 
2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 2016L, 
2016L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 
2018L, 2018L, 2018L, 2018L, 2018L)), .Names = c("name", "amount", 
"decile", "time"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-78L), spec = structure(list(cols = structure(list(name = structure(list(), class = c("collector_character", 
"collector")), amount = structure(list(), class = c("collector_double", 
"collector")), decile = structure(list(), class = c("collector_integer", 
"collector")), time = structure(list(), class = c("collector_integer", 
"collector"))), .Names = c("name", "amount", "decile", "time"
)), default = structure(list(), class = c("collector_guess", 
"collector"))), .Names = c("cols", "default"), class = "col_spec"))

我想生成一個匯總表，按年份列出相鄰十分位組之間的平均差異：十分位組1和2之間的差異（即十分位組1的平均值減去十分位組2的平均值），以及2和3、3和4、4和5、5和6、6和7、7和8、8和9、9和10之間的差異。

有沒有人有什麼建議？

Answer 1

使用dplyr ：

library(dplyr)

# Per-year difference between the mean amount of each decile and the mean
# amount of the next decile (decile k minus decile k + 1).
# Returns one row per (decile, time) with the difference in `res`.
df1 %>%
  # Mean amount for every decile within each year
  group_by(decile, time) %>%
  summarise(res = mean(amount, na.rm = TRUE)) %>%
  # Sort deciles within each year so lead() picks up the following decile
  arrange(time, decile) %>%
  # Regroup by year so differences never cross a year boundary
  group_by(time) %>%
  mutate(res = res - lead(res)) %>%
  # The last decile present in a year has no successor, so its res is NA.
  # Drop the grouping so later verbs do not silently operate per year.
  ungroup()

     decile  time      res
    <int> <int>    <dbl>
 1      1  2016   0.1000
 2      2  2016   0.200 
 3      3  2016   0.100 
 4      4  2016   0.150 
 5      5  2016   0.1000
 6      6  2016  -0.200 
 7      7  2016   0.1000
 8      8  2016   0.100 
 9      9  2016   0.0500
10     10  2016  NA     
11      1  2017   0.263

Answer 2

您也可以這樣做：

library(tidyverse)

# For each adjacent decile pair (1-2, 2-3, ..., 9-10) add a column
# mean_<i>_<i+1> to df1 holding, within each year, the mean amount of
# decile i minus the mean amount of decile i + 1. The value is repeated on
# every row of the year; df1 stays grouped by `time` after the loop.
for (i in 1:9) {
  diff_col <- paste0("mean_", i, "_", i + 1)

  df1 <- df1 %>%
    group_by(time) %>%
    mutate(
      # `!!diff_col :=` names the new column dynamically; `!!i` inlines the
      # loop value so it can never be shadowed by a column of df1. This
      # replaces the deprecated string-based mutate_(.dots = ...) form.
      !!diff_col := mean(amount[decile == !!i], na.rm = TRUE) -
        mean(amount[decile == !!(i + 1)], na.rm = TRUE)
    )
}

輸出為：

# A tibble: 78 x 13
# Groups:   time [3]
   name  amount decile  time mean_1_2 mean_2_3 mean_3_4 mean_4_5 mean_5_6 mean_6_7 mean_7_8 mean_8_9 mean_9_10
   <chr>  <dbl>  <int> <int>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>     <dbl>
 1 a        5.5      1  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
 2 b        5.4      2  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
 3 c        5.2      3  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
 4 d        5.3      4  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
 5 e        5.1      5  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
 6 f        5.1      6  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
 7 g        5        7  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
 8 h        5        8  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
 9 i        4.9      9  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
10 j        4.5     10  2016   0.1000      0.2      0.1     0.15   0.1000     -0.2   0.1000      0.1    0.0500
# ... with 68 more rows

您可以每年獲取完整的摘要，如下所示：

# Collapse to one row per year: drop the grouping, keep only the year and
# the mean_* difference columns, then remove the repeated rows.
df1 <- df1 %>%
  ungroup() %>%
  select(time, starts_with("mean")) %>%
  distinct()

輸出：

# A tibble: 3 x 10
   time mean_1_2 mean_2_3 mean_3_4 mean_4_5 mean_5_6 mean_6_7 mean_7_8 mean_8_9 mean_9_10
  <int>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>    <dbl>     <dbl>
1  2016   0.1000   0.2       0.1      0.15    0.1000   -0.2     0.1000    0.1      0.0500
2  2017   0.263    0.0625    0.213    0.237   0.0875   -1.06    0.0500    0.150    0.25  
3  2018   0.600    0.433     0.433    0.433   0.4       0.633   0.45      0.450    0.9

R中組之間的平均差異

問題描述

2 個解決方案

解決方案1
5 已采納 2018-12-03 11:47:47

解決方案2
1 2018-12-03 12:15:24

R中組之間的平均差異

問題描述

2 個解決方案

解決方案1 5 已采納 2018-12-03 11:47:47

解決方案2 1 2018-12-03 12:15:24

解決方案1
5 已采納 2018-12-03 11:47:47

解決方案2
1 2018-12-03 12:15:24