简体   繁体   中英

How can I represent one column's values using multiple columns in R where one new column is conditional?

Looking at similar questions, I could not find one that matched my need. If one does contain a solution, please share its link.

I have this dput-produced data:

structure(list(Player = c("Seth Lugo", "Jacob deGrom", "Rick Porcello", 
"David Peterson", "Michael Wacha", "Seth Lugo", "Jacob deGrom", 
"Rick Porcello", "David Peterson", "Steven Matz", "Seth Lugo", 
"Jacob deGrom", "Rick Porcello", "David Peterson", "Seth Lugo", 
"Jacob deGrom", "Rick Porcello", "Michael Wacha", "David Peterson", 
"Jacob deGrom", "Seth Lugo", "Rick Porcello", "Robert Gsellman", 
"Michael Wacha", "Ariel Jurado", "Jacob deGrom", "Rick Porcello", 
"Seth Lugo", "Robert Gsellman", "David Peterson"), Date = structure(c(1601164800, 
1601078400, 1601078400, 1600905600, 1600819200, 1600732800, 1600646400, 
1600560000, 1600473600, 1600387200, 1600300800, 1600214400, 1600128000, 
1599955200, 1599868800, 1599782400, 1599609600, 1599523200, 1599436800, 
1599350400, 1599264000, 1599177600, 1599091200, 1599004800, 1598918400, 
1598832000, 1598745600, 1598745600, 1598659200, 1598572800), tzone = "UTC", class = c("POSIXct", 
"POSIXt")), DblHdr = c(0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 2), DateStr = c("09/27/2020", 
"09/26/2020", "09/26/2020", "09/24/2020", "09/23/2020", "09/22/2020", 
"09/21/2020", "09/20/2020", "09/19/2020", "09/18/2020", "09/17/2020", 
"09/16/2020", "09/15/2020", "09/13/2020", "09/12/2020", "09/11/2020", 
"09/09/2020", "09/08/2020", "09/07/2020", "09/06/2020", "09/05/2020", 
"09/04/2020", "09/03/2020", "09/02/2020", "09/01/2020", "08/31/2020", 
"08/30/2020", "08/30/2020", "08/29/2020", "08/28/2020"), Month = c("09", 
"09", "09", "09", "09", "09", "09", "09", "09", "09", "09", "09", 
"09", "09", "09", "09", "09", "09", "09", "09", "09", "09", "09", 
"09", "09", "08", "08", "08", "08", "08"), Tm = c("NYM", "NYM", 
"NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", 
"NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", 
"NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", "NYM", 
"NYM"), Opp = c("WSN", "WSN", "WSN", "WSN", "TBR", "TBR", "TBR", 
"ATL", "ATL", "ATL", "PHI", "PHI", "PHI", "TOR", "TOR", "TOR", 
"BAL", "BAL", "PHI", "PHI", "PHI", "PHI", "NYY", "BAL", "BAL", 
"MIA", "NYY", "NYY", "NYY", "NYY"), Rslt = c("L 5-15", "L 3-4", 
"L 3-5", "W 3-2", "L 5-8", "W 5-2", "L 1-2", "L 0-7", "W 7-2", 
"L 2-15", "W 10-6", "W 5-4", "L 1-4", "L 3-7", "L 2-3", "W 18-1", 
"W 7-6", "L 2-11", "L 8-9", "W 14-1", "W 5-1", "L 3-5", "W 9-7", 
"W 9-4", "L 5-9", "L 3-5", "L 7-8", "L 2-5", "L 1-2", "W 4-3"
), W_L = c("L", "L", "L", "W", "L", "W", "L", "L", "W", "L", 
"W", "W", "L", "L", "L", "W", "W", "L", "L", "W", "W", "L", "W", 
"W", "L", "L", "L", "L", "L", "W"), temp = c("L 5", "L 3", "L 3", 
"W 3", "L 5", "W 5", "L 1", "L 0", "W 7", "L 2", "W 10", "W 5", 
"L 1", "L 3", "L 2", "W 18", "W 7", "L 2", "L 8", "W 14", "W 5", 
"L 3", "W 9", "W 9", "L 5", "L 3", "L 7", "L 2", "L 1", "W 4"
), RS = c(5, 3, 3, 3, 5, 5, 1, 0, 7, 2, 10, 5, 1, 3, 2, 18, 7, 
2, 8, 14, 5, 3, 9, 9, 5, 3, 7, 2, 1, 4), RA = c(15, 4, 5, 2, 
8, 2, 2, 7, 2, 15, 6, 4, 4, 7, 3, 1, 6, 11, 9, 1, 1, 5, 7, 4, 
9, 5, 8, 5, 2, 3), Rdiff = c(-10, -1, -2, 1, -3, 3, -1, -7, 5, 
-13, 4, 1, -3, -4, -1, 17, 1, -9, -1, 13, 4, -2, 2, 5, -4, -2, 
-1, -3, -1, 1), absV = c(10, 1, 2, 1, 3, 3, 1, 7, 5, 13, 4, 1, 
3, 4, 1, 17, 1, 9, 1, 13, 4, 2, 2, 5, 4, 2, 1, 3, 1, 1), App_Dec = c("GS-2, L", 
"GS-5", "GS-3, L", "GS-7, W", "GS-6, L", "GS-7, W", "GS-7, L", 
"GS-7, L", "GS-6, W", "GS-3, L", "GS-2", "GS-2", "GS-6, L", "GS-5, L", 
"GS-6, L", "GS-6, W", "GS-4", "GS-4, L", "GS-2", "GS-7, W", "GS-5, W", 
"GS-6", "GS-2", "GS-3", "GS-4", "GS-6, L", "GS-5", "GS-4", "GS-4", 
"GS-4"), IP = c(1.1, 5, 3, 7, 6, 6.1, 7, 7, 6, 2.2, 1.2, 2, 6, 
5, 5.1, 6, 4, 4, 2, 7, 5, 6, 1.2, 3, 4, 6, 5, 3.2, 4, 4), H = c(5, 
5, 8, 4, 6, 4, 4, 3, 3, 8, 8, 4, 6, 3, 7, 3, 10, 7, 3, 3, 4, 
3, 4, 4, 9, 6, 4, 4, 4, 4), R = c(6, 3, 5, 1, 4, 2, 2, 1, 1, 
6, 6, 3, 4, 2, 3, 1, 5, 5, 5, 1, 1, 2, 4, 2, 5, 4, 2, 1, 1, 3
), ER = c(6, 3, 3, 1, 4, 1, 2, 1, 1, 6, 6, 3, 4, 2, 3, 1, 5, 
4, 5, 1, 1, 2, 4, 2, 5, 1, 2, 1, 1, 3), BB = c(2, 2, 1, 1, 0, 
1, 2, 2, 4, 3, 0, 1, 2, 2, 1, 2, 0, 0, 4, 2, 2, 2, 4, 1, 0, 2, 
2, 2, 0, 3), SO = c(1, 10, 3, 4, 4, 7, 14, 10, 10, 5, 3, 1, 5, 
2, 5, 9, 3, 3, 3, 12, 8, 6, 0, 2, 2, 9, 2, 7, 4, 3), HR = c(0, 
2, 1, 0, 2, 1, 1, 1, 1, 2, 4, 0, 1, 1, 0, 0, 0, 2, 1, 1, 1, 0, 
0, 0, 1, 1, 0, 1, 1, 0), UER = c(0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0), 
    Pit = c(38, 113, 67, 107, 66, 95, 112, 100, 102, 76, 52, 
    40, 94, 81, 91, 102, 66, 71, 70, 108, 81, 100, 52, 69, 84, 
    103, 86, 60, 57, 70), Str = c(24, 78, 42, 68, 45, 66, 70, 
    70, 62, 45, 30, 25, 66, 52, 60, 68, 45, 49, 37, 74, 50, 65, 
    22, 41, 53, 72, 55, 39, 33, 37), GSc = c(19, 53, 29, 68, 
    48, 65, 73, 75, 68, 20, 18, 36, 47, 53, 46, 69, 25, 33, 29, 
    77, 61, 62, 27, 44, 26, 57, 51, 54, 54, 42), BF = c(12, 22, 
    19, 26, 23, 24, 26, 26, 24, 18, 14, 11, 26, 20, 24, 23, 21, 
    20, 14, 26, 21, 23, 13, 15, 21, 27, 20, 16, 15, 18), AB = c(8, 
    20, 18, 24, 23, 23, 23, 23, 20, 15, 13, 9, 24, 18, 22, 21, 
    21, 20, 9, 24, 19, 21, 8, 13, 20, 25, 18, 14, 15, 15), H2B = c(2, 
    0, 1, 1, 1, 0, 2, 0, 2, 2, 1, 2, 1, 0, 2, 1, 1, 1, 1, 1, 
    0, 0, 1, 0, 2, 2, 2, 0, 1, 0), H3B = c(0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 
    0, 0, 0, 1, 0), IBB = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0), 
    HBP = c(1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 
    0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0), SH = c(0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 1, 0, 0, 0, 0, 0), SF = c(1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 
    0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 
    0), GDP = c(0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1), SB = c(0, 1, 
    1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 2, 0, 
    1, 0, 0, 0, 3, 0, 0, 0, 0), CS = c(0, 0, 0, 0, 1, 0, 1, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0), PO = c(0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), BK = c(0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0), WP = c(0, 1, 1, 1, 0, 0, 0, 
    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 
    0, 1, 0, 0), ERA = c("40.5", "5.4", "9", "1.29", "6", "1.42", 
    "2.57", "1.29", "1.5", "20.25", "32.4", "13.5", "6", "3.6", 
    "5.0599999999999996", "1.5", "11.25", "9", "22.5", "1.29", 
    "1.8", "3", "21.6", "6", "11.25", "1.5", "3.6", "2.4500000000000002", 
    "2.25", "6.75"), WPA = c(-0.471, -0.087, -0.256, 0.34, -0.22, 
    0.18, 0.107, 0.219, 0.229, -0.358, -0.487, -0.186, -0.156, 
    0.036, -0.047, 0.049, -0.329, -0.321, -0.34, 0.193, 0.156, 
    0.07, -0.312, -0.042, -0.278, -0.271, 0.029, 0.02, 0.092, 
    -0.174), RE24 = c(-5.122, -0.193, -3.316, 2.931, -1.08, 1.509, 
    1.406, 2.406, 1.92, -4.641, -5.444, -1.919, -0.758, 0.679, 
    0.245, 2.215, -3.054, -3.054, -4.027, 2.406, 1.433, 0.92, 
    -3.788, -0.359, -2.812, -1.08, 0.707, 0.364, 1.166, -0.834
    ), aLI = c(1.45, 1.244, 0.974, 1.271, 0.965, 0.921, 0.955, 
    0.888, 1.066, 0.962, 0.767, 1.073, 0.941, 0.852, 1.353, 0.392, 
    0.857, 0.805, 0.904, 0.75, 1.037, 0.861, 1.232, 1.355, 0.914, 
    1.239, 1.213, 1.28, 0.748, 1.407)), row.names = c(NA, -30L
), class = c("tbl_df", "tbl", "data.frame"))

Desired output:

The numbers starting in the second column are the total absV values for each player for each column. The last column contains the sum of all the absV values for each player where absV > 5. Only a sample of the first 3 rows are shown, and the absV values are just filler numbers.

| Player | 1 | 2 | 3 | 4 | 5 | >5 |
| deGrom | 2 | 3 | 5 | 0 | 1 | 3 |
| Matz | 2 | 3 | 5 | 0 | 1 | 3 |

Code tried (I need help getting beyond the point shown). I would prefer if the code uses dplyr:

# Count the rows for every Player/absV combination (numG), then order the
# result by player and by absV. summarize() keeps only the grouping keys and
# the new count, so no explicit column selection is needed beforehand.
starter %>%
  group_by(Player, absV) %>%
  summarize(numG = n()) %>%
  arrange(Player, absV)

To do this, you need to split each player's rows into two parts (row number <= 5 and row number > 5), sum the second part, then rbind the two parts together and finally apply pivot_wider. Follow this code:

library(dplyr) 
library(tidyr) 

# Keep only the columns needed for reshaping and number each player's rows
# in their existing order; the result stays grouped by Player.
df <- starter %>%
  select(Player, absV) %>%
  group_by(Player) %>%
  mutate(row = row_number()) %>%
  arrange(Player)

# Rows 1-5 of each player become their own columns; every row after the
# fifth is summed per player into a single ">5" column. `row` is converted
# to character in the first part so both parts share the same column type.
rbind(
  df %>%
    filter(row <= 5) %>%
    mutate(row = as.character(row)),
  df %>%
    filter(row > 5) %>%
    summarise(absV = sum(absV)) %>%
    mutate(row = ">5")
) %>%
  pivot_wider(id_cols = Player, names_from = row, values_from = absV)

# A tibble: 8 x 7
# Groups:   Player [8]
  Player            `1`   `2`   `3`   `4`   `5`  `>5`
  <chr>           <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Ariel Jurado        4    NA    NA    NA    NA    NA
2 David Peterson      1     5     4     1     1    NA
3 Jacob deGrom        1     1     1    17    13     2
4 Michael Wacha       3     9     5    NA    NA    NA
5 Rick Porcello       2     7     3     1     2     1
6 Robert Gsellman     2     1    NA    NA    NA    NA
7 Seth Lugo          10     3     4     1     4     3
8 Steven Matz        13    NA    NA    NA    NA    NA

Note: Loading the tidyverse package directly, which attaches dplyr and tidyr at once, is advised.

Note 2: If you want to sort by absV before changing the data format, add absV to the arrange() call before the rows are numbered and the two parts are joined.

# Sort by player and then by absV before numbering, so that row 1 is each
# player's smallest absV; the result stays grouped by Player.
df <- starter %>%
  select(Player, absV) %>%
  arrange(Player, absV) %>%
  group_by(Player) %>%
  mutate(row = row_number())

# The five smallest values per player get their own columns; the remaining
# (larger) values are summed per player into the ">5" column.
rbind(
  df %>%
    filter(row <= 5) %>%
    mutate(row = as.character(row)),
  df %>%
    filter(row > 5) %>%
    summarise(absV = sum(absV)) %>%
    mutate(row = ">5")
) %>%
  pivot_wider(id_cols = Player, names_from = row, values_from = absV)

# This gives the following, different output:

# A tibble: 8 x 7
# Groups:   Player [8]
  Player            `1`   `2`   `3`   `4`   `5`  `>5`
  <chr>           <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Ariel Jurado        4    NA    NA    NA    NA    NA
2 David Peterson      1     1     1     4     5    NA
3 Jacob deGrom        1     1     1     2    13    17
4 Michael Wacha       3     5     9    NA    NA    NA
5 Rick Porcello       1     1     2     2     3     7
6 Robert Gsellman     1     2    NA    NA    NA    NA
7 Seth Lugo           1     3     3     4     4    10
8 Steven Matz        13    NA    NA    NA    NA    NA

Additional Question in comments below

Use this code to work out the frequency of each absV value for each player:

# Attach to every row the number of rows that share its Player/absV pair.
# The existing grouping is dropped first so the result comes back ungrouped.
df %>%
  ungroup() %>%
  add_count(Player, absV, name = "freq")

# Check it: show only the key columns next to the new count.
df %>%
  ungroup() %>%
  add_count(Player, absV, name = "freq") %>%
  select(Player, absV, freq)
   Player          absV  freq
   <chr>          <dbl> <int>
 1 Seth Lugo         10     1
 2 Jacob deGrom       1     3
 3 Rick Porcello      2     2
 4 David Peterson     1     3
 5 Michael Wacha      3     1
 6 Seth Lugo          3     2
 7 Jacob deGrom       1     3
 8 Rick Porcello      7     1
 9 David Peterson     5     1
10 Steven Matz       13     1
# ... with 20 more rows

Using data.table

library(data.table)
# Convert `starter` to a data.table by reference, then spread each player's
# absV values across columns numbered by the within-player row index.
setDT(starter)
dcast(starter, Player ~ rowid(Player), value.var = "absV")

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM