简体   繁体   中英

Can I combine a dplyr mutate_at & mutate_if statement?

I have the following example output:

   country country-year year     a     b
1  France  France2000   2000       NA    NA 
2  France  France2001   2001     1000  1000  
3  France  France2002   2002       NA    NA
4  France  France2003   2003     1600  2200
5  France  France2004   2004       NA    NA
6  UK          UK2000   2000     1000  1000  
7  UK          UK2001   2001       NA    NA
8  UK          UK2002   2002     1000  1000  
9  UK          UK2003   2003       NA    NA
10 UK          UK2004   2004       NA    NA
11 Germany     UK2000   2000       NA    NA 
12 Germany     UK2001   2001       NA    NA
13 Germany     UK2002   2002       NA    NA  
14 Germany     UK2003   2003       NA    NA
15 Germany     UK2004   2004       NA    NA

I want to interpolate the data I (but not extrapolate), and remove the columns for which columns a and b are both NA. In other words I would like to remove all the columns for which I cannot interpolate; in the example:

1  France  France2000        NA    NA
5  France  France2004        NA    NA
9  UK          UK2003        NA    NA
10 UK          UK2004        NA    NA
11 Germany     UK2000        NA    NA 
12 Germany     UK2001        NA    NA
13 Germany     UK2002        NA    NA  
14 Germany     UK2003        NA    NA
15 Germany     UK2004        NA    NA

There are two options that almost do what I want:

library(tidyverse)
library(zoo)
df %>%
  group_by(country) %>%
  mutate_at(vars(a:b),~na.fill(.x,c(NA, "extend", NA))) %>% 
  filter(!is.na(a) | !is.na(b))

AND

df%>% 
  group_by(Country)%>% 
  mutate_if(is.numeric,~if(all(is.na(.x))) NA else na.fill(.x,"extend"))

Would it be possible to combine these codes, doing something like this:

df <- df%>%
  group_by(country)%>%
  mutate_at(vars(a:b),~if(all(is.na(.x))) NA else(.x,c(NA, "extend", NA)))
  filter(!is.na(df$a | df$a))

Desired output:

   country country-year    a     b 
2  France  France2001      1000  1000  
3  France  France2002      1300  1600
4  France  France2003      1600  2200
6  UK          UK2000      1000  1000  
7  UK          UK2001         0     0
8  UK          UK2002      1000  1000

Unfortunately the solution of @kath onyl works in given example but fails if only one column contains data, eg:

country country-year year     a     b
France  France2000   2000       NA    NA 
France  France2001   2001     1000  1000  
France  France2002   2002       NA    NA
France  France2003   2003     1600  2200
France  France2004   2004       NA    NA
UK          UK2000   2000     1000  1000  
UK          UK2001   2001       NA    NA
UK          UK2002   2002     1000  1000  
UK          UK2003   2003       NA    NA
UK          UK2004   2004       NA    NA
Germany     UK2000   2000       NA    NA 
Germany     UK2001   2001       NA   500
Germany     UK2002   2002       NA    NA  
Germany     UK2003   2003       NA  1100
Germany     UK2004   2004       NA    NA

Unfortunately too, the answer to the OPs question is no, you can't mix mutate_at and mutate_if (there's no function that allows you to specify .predicate and .vars)

but you can use a predict function within the function used in mutate_at. So here is my solution using mutate_at containing a predict function:

df %>%
  group_by(country) %>%
  # Interpolate if at least two non-null values are present
  mutate_at(vars(a,b), funs(if(sum(!is.na(.))<2) {NA_real_} else{approx(year, ., year)$y})) %>% 
  # keep only rows with original or interpolated values in either column a or b
  filter_at(vars(a,b), any_vars(!is.na(.)))

I know this doesn't directly answer the question how to combine mutate_if and mutate_at , but this solves your general problem:

I first get rid of the countries where all a and b are missing, and then determine for each country the minimum and maximum Year, which is not missing. After filtering these, I use the na.fill .

library(dplyr)
library(readr)
library(zoo)

country_data %>% 
  mutate(Year = parse_number(`country-year`)) %>% 
  group_by(country) %>% 
  mutate(not_all_na = any(!(is.na(a) & is.na(b)))) %>% 
  filter(not_all_na) %>% 
  mutate(Year_min_not_na = min(Year[!(is.na(a) & is.na(b))]), 
         Year_max_not_na = max(Year[!(is.na(a) & is.na(b))])) %>% 
  filter(Year >= Year_min_not_na, Year <= Year_max_not_na) %>% 
  mutate_at(vars(a:b), ~na.fill(.x, "extend")) 

# A tibble: 6 x 8
# Groups:   country [2]
#   country `country-year`     a     b  Year not_all_na Year_min_not_na Year_max_not_na
#   <fct>   <fct>          <dbl> <dbl> <dbl> <lgl>                <dbl>           <dbl>
# 1 France  France2001      1000  1000  2001 TRUE                  2001            2003
# 2 France  France2002      1300  1600  2002 TRUE                  2001            2003
# 3 France  France2003      1600  2200  2003 TRUE                  2001            2003
# 4 UK      UK2000          1000  1000  2000 TRUE                  2000            2002
# 5 UK      UK2001          1000  1000  2001 TRUE                  2000            2002
# 6 UK      UK2002          1000  1000  2002 TRUE                  2000            2002

Data

country_data <- 
  structure(list(country = structure(c(1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L), 
                                                   .Label = c("France", "Germany", "UK"), class = "factor"), 
                               country.year = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 6L, 7L, 8L, 9L, 10L), 
                                                        .Label = c("France2000", "France2001", "France2002", "France2003", 
                                                                   "France2004", "UK2000", "UK2001", "UK2002", "UK2003", "UK2004"), 
                                                        class = "factor"), 
                               a = c(NA, 1000L, NA, 1600L, NA, 1000L, NA, 1000L, NA, NA, NA, NA, NA, NA, NA),
                               b = c(NA, 1000L, NA, 2200L, NA, 1000L, NA, 1000L, NA, NA, NA, NA, NA, NA, NA)), 
                          class = "data.frame", row.names = c(NA, -15L))

Here is my take:

library(data.table)
library(tidyverse)
library(zoo)

df <- fread("
n   country country-year    a     b
1  France  France2000        NA    NA 
2  France  France2001      1000  1000  
3  France  France2002        NA    NA
4  France  France2003      1600  2200
5  France  France2004        NA    NA
6  UK          UK2000      1000  1000  
7  UK          UK2001        NA    NA
8  UK          UK2002      1000  1000  
9  UK          UK2003        NA    NA
10 UK          UK2004        NA    NA
11 Germany     UK2000        NA    NA 
12 Germany     UK2001        NA    NA
13 Germany     UK2002        NA    NA  
14 Germany     UK2003        NA    NA
15 Germany     UK2004        NA    NA
            ") %>% select(-n)

# Clean data
df <- df %>% 
  mutate(year = str_extract_all(`country-year`, "[0-9]{4}$", simplify = T)) %>% 
  select(country, year, a, b) 

# Remove all rows NA in a and b if there is no earlier 
# or later row with value for a and b
# I hope this was what you meant with extrapolate :)
df <- df %>% 
  group_by(country) %>% 
  filter(year >= min(year[!is.na(a) | !is.na(b)]),
         year <= max(year[!is.na(a) | !is.na(b)])) %>% 
  ungroup()

# Intrapolate
df %>% 
  mutate_at(vars(a:b), ~na.fill(., "extend"))

Result:

# A tibble: 6 x 4
  country year      a     b
  <chr>   <chr> <dbl> <dbl>
1 France  2001  1000. 1000.
2 France  2002  1300. 1600.
3 France  2003  1600. 2200.
4 UK      2000  1000. 1000.
5 UK      2001  1000. 1000.
6 UK      2002  1000. 1000.

Here's another two methods using filter and slice . This first approach should be closest to what OP's looking for:

library(dplyr)
library(zoo)

df %>%
  group_by(country) %>%
  mutate_if(is.numeric, na.approx, na.rm = FALSE) %>%
  filter(!is.na(a|b))

or with slice :

df %>%
  group_by(country) %>%
  filter(any(!is.na(a|b))) %>%
  slice(min(which(!is.na(a|b))):max(which(!is.na(a|b)))) %>%
  mutate_if(is.numeric, na.approx)

Result:

# A tibble: 6 x 4
# Groups:   country [2]
  country country.year     a     b
  <fct>   <fct>        <dbl> <dbl>
1 France  France2001    1000  1000
2 France  France2002    1300  1600
3 France  France2003    1600  2200
4 UK      UK2000        1000  1000
5 UK      UK2001        1000  1000
6 UK      UK2002        1000  1000

Data:

df <- structure(list(country = structure(c(1L, 1L, 1L, 1L, 1L, 3L, 
3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L), .Label = c("France", "Germany", 
"UK"), class = "factor"), country.year = structure(c(1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 6L, 7L, 8L, 9L, 10L), .Label = c("France2000", 
"France2001", "France2002", "France2003", "France2004", "UK2000", 
"UK2001", "UK2002", "UK2003", "UK2004"), class = "factor"), a = c(NA, 
1000L, NA, 1600L, NA, 1000L, NA, 1000L, NA, NA, NA, NA, NA, NA, 
NA), b = c(NA, 1000L, NA, 2200L, NA, 1000L, NA, 1000L, NA, NA, 
NA, NA, NA, NA, NA)), .Names = c("country", "country.year", "a", 
"b"), class = "data.frame", row.names = c("1", "2", "3", "4", 
"5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15"))

With dplyr 0.8.3 inspired by:

library(dplyr)
(iris [1:3,] 
    %>% mutate_at(c("Petal.Width"),
                  list(~ifelse(Sepal.Width == 3.5, 
                               .+10,
                               .+100)
                  )
    )
)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          5.1         3.5          1.4        10.2  setosa
#> 2          4.9         3.0          1.4       100.2  setosa
#> 3          4.7         3.2          1.3       100.2  setosa

with a new column toto:

library(dplyr)
(iris [1:3,] 
  %>% mutate_at(c("Petal.Width"),
                list(toto=~ifelse(Sepal.Width == 3.5, 
                             .+10,
                             .+100)
                )
  )
)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species  toto
#> 1          5.1         3.5          1.4         0.2  setosa  10.2
#> 2          4.9         3.0          1.4         0.2  setosa 100.2
#> 3          4.7         3.2          1.3         0.2  setosa 100.2

Created on 2019-07-30 by the reprex package (v0.2.1)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM