简体   繁体   中英

Syntax for multiple successive operations across columns in `dplyr`

I'm struggling with the right syntax for multiple successive operations across columns in `dplyr`. Given this data:

# Example data frame: 20 rows, two numeric columns (A1 and A2).
df <- structure(
  list(
    A1 = c(
      838.611, 824.048, 668.901, 225.075, 0, 0, 341.291, 0, 101.652,
      127.341, 0, 297.092, 0, 0, 0, 0, 0, 764.737, 759.51, 772.21
    ),
    A2 = c(
      499.041, 492.997, 486.132, 469.503, 476.782, 464.18, 469.833,
      462.317, 455.507, 441.47, 490.147, 430.844, 0, 0, 0, 0, 0, 0, 0,
      124.068
    )
  ),
  row.names = c(NA, 20L),
  class = "data.frame"
)

Say I want to apply the following changes to columns `A1` and `A2`:

    1. replace 0 with NA
    2. set outliers to NA
    3. interpolate NA

The following syntax performs only change 1, but not changes 2 and 3:

library(dplyr)
library(zoo)
# Attempt: hand the three cleaning steps to across() as separate arguments.
# As the printed result below shows, only the first step (0 -> NA) takes
# effect; outliers are not removed and NAs are not interpolated.
# NOTE(review): presumably the 2nd and 3rd lambdas are absorbed by across()'s
# `...` instead of being applied as functions -- confirm against the dplyr docs.
df %>%
  mutate(across(starts_with("A"),
                # step 1: replace 0 with NA
                ~na_if(.,0),
                # step 2 (intended): set values flagged by boxplot() to NA
                ~ifelse(. %in% boxplot(.)$out, NA, .),
                # step 3 (intended): interpolate the NAs
                ~na.approx(., na.rm = FALSE, rule = 2)))
        A1      A2
1  838.611 499.041
2  824.048 492.997
3  668.901 486.132
4  225.075 469.503
5       NA 476.782
6       NA 464.180
7  341.291 469.833
8       NA 462.317
9  101.652 455.507
10 127.341 441.470
11      NA 490.147
12 297.092 430.844
13      NA      NA
14      NA      NA
15      NA      NA
16      NA      NA
17      NA      NA
18 764.737      NA
19 759.510      NA
20 772.210 124.068

EDIT: The correct output is obtained from this (repetitive) type of code, which I'd like to avoid:

# Reference result: one mutate(across()) pass per cleaning step, so each
# step operates on the output of the previous one.
df %>%
  # Step 1: replace 0 with NA.
  mutate(across(starts_with("A"), ~ na_if(.x, 0))) %>%
  # Step 2: set the values that boxplot() reports as outliers to NA.
  mutate(across(
    starts_with("A"),
    ~ ifelse(.x %in% boxplot(.x)$out, NA, .x)
  )) %>%
  # Step 3: interpolate the NAs.
  mutate(across(
    starts_with("A"),
    ~ na.approx(.x, na.rm = FALSE, rule = 2)
  ))

         A1      A2
1  838.6110 499.041
2  824.0480 492.997
3  668.9010 486.132
4  225.0750 469.503
5  263.8137 476.782
6  302.5523 464.180
7  341.2910 469.833
8  221.4715 462.317
9  101.6520 455.507
10 127.3410 441.470
11 212.2165 490.147
12 297.0920 430.844
13 375.0328 430.844
14 452.9737 430.844
15 530.9145 430.844
16 608.8553 430.844
17 686.7962 430.844
18 764.7370 430.844
19 759.5100 430.844
20 772.2100 430.844

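To answer OP's question in the comments: passing the functions to `across()` as a list applies each of them separately to the original column, giving one new column per function.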

# Supplying the steps as a list applies each function to the *original*
# column and adds one output column per (input column, function) pair.
df %>%
  mutate(across(
    starts_with("A"),
    list(
      ~ na_if(.x, 0),
      ~ ifelse(.x %in% boxplot(.x)$out, NA, .x),
      ~ na.approx(.x, na.rm = FALSE, rule = 2)
    )
  ))
        A1      A2    A1_1    A1_2    A1_3    A2_1    A2_2    A2_3
1  838.611 499.041 838.611 838.611 838.611 499.041 499.041 499.041
2  824.048 492.997 824.048 824.048 824.048 492.997 492.997 492.997
3  668.901 486.132 668.901 668.901 668.901 486.132 486.132 486.132
4  225.075 469.503 225.075 225.075 225.075 469.503 469.503 469.503
5    0.000 476.782      NA   0.000   0.000 476.782 476.782 476.782
6    0.000 464.180      NA   0.000   0.000 464.180 464.180 464.180
7  341.291 469.833 341.291 341.291 341.291 469.833 469.833 469.833
8    0.000 462.317      NA   0.000   0.000 462.317 462.317 462.317
9  101.652 455.507 101.652 101.652 101.652 455.507 455.507 455.507
10 127.341 441.470 127.341 127.341 127.341 441.470 441.470 441.470
11   0.000 490.147      NA   0.000   0.000 490.147 490.147 490.147
12 297.092 430.844 297.092 297.092 297.092 430.844 430.844 430.844
13   0.000   0.000      NA   0.000   0.000      NA   0.000   0.000
14   0.000   0.000      NA   0.000   0.000      NA   0.000   0.000
15   0.000   0.000      NA   0.000   0.000      NA   0.000   0.000
16   0.000   0.000      NA   0.000   0.000      NA   0.000   0.000
17   0.000   0.000      NA   0.000   0.000      NA   0.000   0.000
18 764.737   0.000 764.737 764.737 764.737      NA   0.000   0.000
19 759.510   0.000 759.510 759.510 759.510      NA   0.000   0.000
20 772.210 124.068 772.210 772.210 772.210 124.068 124.068 124.068

You can give the output columns more meaningful names by (amongst other things) naming the list elements:

# Same as the unnamed-list call, but the list names become the suffixes of
# the generated columns (A1_Zero, A1_BoxPlot, A1_Approx, ...).
df %>%
  mutate(across(
    starts_with("A"),
    list(
      Zero = ~ na_if(.x, 0),
      BoxPlot = ~ ifelse(.x %in% boxplot(.x)$out, NA, .x),
      Approx = ~ na.approx(.x, na.rm = FALSE, rule = 2)
    )
  ))
        A1      A2 A1_Zero A1_BoxPlot A1_Approx A2_Zero A2_BoxPlot A2_Approx
1  838.611 499.041 838.611    838.611   838.611 499.041    499.041   499.041
2  824.048 492.997 824.048    824.048   824.048 492.997    492.997   492.997
...

Update in response to OP's comment below

across() has a .names argument that allows control over the naming of the output columns, but that won't work here because across() outputs one column for every combination of input column and function. We want to apply multiple functions to each input column, producing a single output column for each input column. To do that, wrap the functions that wrangle each column in a single function. This has the same effect as the multiple mutate calls in OP's edit of the original question.

df %>%
  mutate(
    across(
      starts_with("A"),
      # A single function per column: the three cleaning steps run in
      # sequence, each one working on the result of the previous step.
      function(.x) {
        # 1. Replace 0 with NA.
        .x <- na_if(.x, 0)
        # 2. Set outliers to NA. boxplot.stats() returns the same `out`
        #    values as boxplot() but does not draw a plot for every column.
        .x <- ifelse(.x %in% boxplot.stats(.x)$out, NA, .x)
        # 3. Interpolate the NAs; na.rm = FALSE keeps the vector length so
        #    the result still fits the data frame.
        na.approx(.x, na.rm = FALSE, rule = 2)
      }
    )
  )
         A1      A2
1  838.6110 499.041
2  824.0480 492.997
3  668.9010 486.132
4  225.0750 469.503
5  263.8137 476.782
6  302.5523 464.180
7  341.2910 469.833
8  221.4715 462.317
9  101.6520 455.507
10 127.3410 441.470
11 212.2165 490.147
12 297.0920 430.844
13 375.0328 430.844
14 452.9737 430.844
15 530.9145 430.844
16 608.8553 430.844
17 686.7962 430.844
18 764.7370 430.844
19 759.5100 430.844
20 772.2100 430.844

For clarity, I wrote a custom function that can be applied to multiple columns with `across()`.

library(dplyr)
library(zoo)

#' Clean one column: zeros to NA, outliers to NA, then interpolate
#'
#' The three steps run in order, each on the result of the previous one.
#'
#' @param x A vector to clean (numeric in the example data).
#' @return `x` with zeros and boxplot outliers replaced by interpolated
#'   values, as produced by `zoo::na.approx(na.rm = FALSE, rule = 2)`.
apply_fun <- function(x) {
  # 1. Replace 0 with NA.
  x <- na_if(x, 0)
  # 2. Set outliers to NA. boxplot.stats() gives the same `out` values as
  #    boxplot() without drawing a plot on every call.
  outliers <- boxplot.stats(x)$out
  x <- ifelse(x %in% outliers, NA, x)
  # 3. Interpolate the NAs, keeping the original length.
  na.approx(x, na.rm = FALSE, rule = 2)
}


# Apply the cleaning function to every column whose name starts with "A".
df %>%
  mutate(across(starts_with("A"), apply_fun))

#         A1      A2
#1  838.6110 499.041
#2  824.0480 492.997
#3  668.9010 486.132
#4  225.0750 469.503
#5  263.8137 476.782
#6  302.5523 464.180
#7  341.2910 469.833
#8  221.4715 462.317
#9  101.6520 455.507
#10 127.3410 441.470
#11 212.2165 490.147
#12 297.0920 430.844
#13 375.0328 430.844
#14 452.9737 430.844
#15 530.9145 430.844
#16 608.8553 430.844
#17 686.7962 430.844
#18 764.7370 430.844
#19 759.5100 430.844
#20 772.2100 430.844

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM