簡體   English   中英

我可以組合一個dplyr mutate_at和mutate_if語句嗎?

[英]Can I combine a dplyr mutate_at & mutate_if statement?

我有以下示例輸出:

   country country-year year     a     b
1  France  France2000   2000       NA    NA 
2  France  France2001   2001     1000  1000  
3  France  France2002   2002       NA    NA
4  France  France2003   2003     1600  2200
5  France  France2004   2004       NA    NA
6  UK          UK2000   2000     1000  1000  
7  UK          UK2001   2001       NA    NA
8  UK          UK2002   2002     1000  1000  
9  UK          UK2003   2003       NA    NA
10 UK          UK2004   2004       NA    NA
11 Germany     UK2000   2000       NA    NA 
12 Germany     UK2001   2001       NA    NA
13 Germany     UK2002   2002       NA    NA  
14 Germany     UK2003   2003       NA    NA
15 Germany     UK2004   2004       NA    NA

我想插入數據I(但不是外推),並刪除列ab都是NA的列。 換句話說,我想刪除所有無法插入的列; 在示例中:

1  France  France2000        NA    NA
5  France  France2004        NA    NA
9  UK          UK2003        NA    NA
10 UK          UK2004        NA    NA
11 Germany     UK2000        NA    NA 
12 Germany     UK2001        NA    NA
13 Germany     UK2002        NA    NA  
14 Germany     UK2003        NA    NA
15 Germany     UK2004        NA    NA

有兩種選擇幾乎可以滿足我的需求:

library(tidyverse)
library(zoo)
df %>%
  group_by(country) %>%
  mutate_at(vars(a:b),~na.fill(.x,c(NA, "extend", NA))) %>% 
  filter(!is.na(a) | !is.na(b))

df%>% 
  group_by(Country)%>% 
  mutate_if(is.numeric,~if(all(is.na(.x))) NA else na.fill(.x,"extend"))

是否可以組合這些代碼,做這樣的事情:

df <- df%>%
  group_by(country)%>%
  mutate_at(vars(a:b),~if(all(is.na(.x))) NA else(.x,c(NA, "extend", NA)))
  filter(!is.na(df$a | df$a))

期望的輸出:

   country country-year    a     b 
2  France  France2001      1000  1000  
3  France  France2002      1300  1600
4  France  France2003      1600  2200
6  UK          UK2000      1000  1000  
7  UK          UK2001         0     0
8  UK          UK2002      1000  1000

不幸的是,@ kath onyl的解決方案在給定示例中起作用,但如果只有一列包含數據則失敗,例如:

country country-year year     a     b
France  France2000   2000       NA    NA 
France  France2001   2001     1000  1000  
France  France2002   2002       NA    NA
France  France2003   2003     1600  2200
France  France2004   2004       NA    NA
UK          UK2000   2000     1000  1000  
UK          UK2001   2001       NA    NA
UK          UK2002   2002     1000  1000  
UK          UK2003   2003       NA    NA
UK          UK2004   2004       NA    NA
Germany     UK2000   2000       NA    NA 
Germany     UK2001   2001       NA   500
Germany     UK2002   2002       NA    NA  
Germany     UK2003   2003       NA  1100
Germany     UK2004   2004       NA    NA

不幸的是,OP問題的答案是否定的,你不能混合mutate_at和mutate_if(沒有允許你指定.predicate和.vars的函數)

但是你可以在mutate_at中使用的函數中使用預測函數。 所以這是我使用包含預測函數的mutate_at的解決方案:

df %>%
  group_by(country) %>%
  # Interpolate if at least two non-null values are present
  mutate_at(vars(a,b), funs(if(sum(!is.na(.))<2) {NA_real_} else{approx(year, ., year)$y})) %>% 
  # keep only rows with original or interpolated values in either column a or b
  filter_at(vars(a,b), any_vars(!is.na(.)))

我知道這並沒有直接回答如何組合mutate_ifmutate_at ,但這解決了你的一般問題:

我首先擺脫了所有a和b都缺失的國家,然后確定每個國家的最低和最高年份,這是不缺少的。 過濾這些后,我使用na.fill

library(dplyr)
library(readr)
library(zoo)

country_data %>% 
  mutate(Year = parse_number(`country-year`)) %>% 
  group_by(country) %>% 
  mutate(not_all_na = any(!(is.na(a) & is.na(b)))) %>% 
  filter(not_all_na) %>% 
  mutate(Year_min_not_na = min(Year[!(is.na(a) & is.na(b))]), 
         Year_max_not_na = max(Year[!(is.na(a) & is.na(b))])) %>% 
  filter(Year >= Year_min_not_na, Year <= Year_max_not_na) %>% 
  mutate_at(vars(a:b), ~na.fill(.x, "extend")) 

# A tibble: 6 x 8
# Groups:   country [2]
#   country `country-year`     a     b  Year not_all_na Year_min_not_na Year_max_not_na
#   <fct>   <fct>          <dbl> <dbl> <dbl> <lgl>                <dbl>           <dbl>
# 1 France  France2001      1000  1000  2001 TRUE                  2001            2003
# 2 France  France2002      1300  1600  2002 TRUE                  2001            2003
# 3 France  France2003      1600  2200  2003 TRUE                  2001            2003
# 4 UK      UK2000          1000  1000  2000 TRUE                  2000            2002
# 5 UK      UK2001          1000  1000  2001 TRUE                  2000            2002
# 6 UK      UK2002          1000  1000  2002 TRUE                  2000            2002

數據

country_data <- 
  structure(list(country = structure(c(1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L), 
                                                   .Label = c("France", "Germany", "UK"), class = "factor"), 
                               country.year = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 6L, 7L, 8L, 9L, 10L), 
                                                        .Label = c("France2000", "France2001", "France2002", "France2003", 
                                                                   "France2004", "UK2000", "UK2001", "UK2002", "UK2003", "UK2004"), 
                                                        class = "factor"), 
                               a = c(NA, 1000L, NA, 1600L, NA, 1000L, NA, 1000L, NA, NA, NA, NA, NA, NA, NA),
                               b = c(NA, 1000L, NA, 2200L, NA, 1000L, NA, 1000L, NA, NA, NA, NA, NA, NA, NA)), 
                          class = "data.frame", row.names = c(NA, -15L))

這是我的看法:

library(data.table)
library(tidyverse)
library(zoo)

df <- fread("
n   country country-year    a     b
1  France  France2000        NA    NA 
2  France  France2001      1000  1000  
3  France  France2002        NA    NA
4  France  France2003      1600  2200
5  France  France2004        NA    NA
6  UK          UK2000      1000  1000  
7  UK          UK2001        NA    NA
8  UK          UK2002      1000  1000  
9  UK          UK2003        NA    NA
10 UK          UK2004        NA    NA
11 Germany     UK2000        NA    NA 
12 Germany     UK2001        NA    NA
13 Germany     UK2002        NA    NA  
14 Germany     UK2003        NA    NA
15 Germany     UK2004        NA    NA
            ") %>% select(-n)

# Clean data
df <- df %>% 
  mutate(year = str_extract_all(`country-year`, "[0-9]{4}$", simplify = T)) %>% 
  select(country, year, a, b) 

# Remove all rows NA in a and b if there is no earlier 
# or later row with value for a and b
# I hope this was what you meant with extrapolate :)
df <- df %>% 
  group_by(country) %>% 
  filter(year >= min(year[!is.na(a) | !is.na(b)]),
         year <= max(year[!is.na(a) | !is.na(b)])) %>% 
  ungroup()

# Intrapolate
df %>% 
  mutate_at(vars(a:b), ~na.fill(., "extend"))

結果:

# A tibble: 6 x 4
  country year      a     b
  <chr>   <chr> <dbl> <dbl>
1 France  2001  1000. 1000.
2 France  2002  1300. 1600.
3 France  2003  1600. 2200.
4 UK      2000  1000. 1000.
5 UK      2001  1000. 1000.
6 UK      2002  1000. 1000.

這是使用filterslice的另外兩種方法。 第一種方法應該與OP的尋找方式最接近:

library(dplyr)
library(zoo)

df %>%
  group_by(country) %>%
  mutate_if(is.numeric, na.approx, na.rm = FALSE) %>%
  filter(!is.na(a|b))

或者用slice

df %>%
  group_by(country) %>%
  filter(any(!is.na(a|b))) %>%
  slice(min(which(!is.na(a|b))):max(which(!is.na(a|b)))) %>%
  mutate_if(is.numeric, na.approx)

結果:

# A tibble: 6 x 4
# Groups:   country [2]
  country country.year     a     b
  <fct>   <fct>        <dbl> <dbl>
1 France  France2001    1000  1000
2 France  France2002    1300  1600
3 France  France2003    1600  2200
4 UK      UK2000        1000  1000
5 UK      UK2001        1000  1000
6 UK      UK2002        1000  1000

數據:

df <- structure(list(country = structure(c(1L, 1L, 1L, 1L, 1L, 3L, 
3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L), .Label = c("France", "Germany", 
"UK"), class = "factor"), country.year = structure(c(1L, 2L, 
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 6L, 7L, 8L, 9L, 10L), .Label = c("France2000", 
"France2001", "France2002", "France2003", "France2004", "UK2000", 
"UK2001", "UK2002", "UK2003", "UK2004"), class = "factor"), a = c(NA, 
1000L, NA, 1600L, NA, 1000L, NA, 1000L, NA, NA, NA, NA, NA, NA, 
NA), b = c(NA, 1000L, NA, 2200L, NA, 1000L, NA, 1000L, NA, NA, 
NA, NA, NA, NA, NA)), .Names = c("country", "country.year", "a", 
"b"), class = "data.frame", row.names = c("1", "2", "3", "4", 
"5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15"))

隨着dplyr 0.8.3的啟發:

library(dplyr)
(iris [1:3,] 
    %>% mutate_at(c("Petal.Width"),
                  list(~ifelse(Sepal.Width == 3.5, 
                               .+10,
                               .+100)
                  )
    )
)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          5.1         3.5          1.4        10.2  setosa
#> 2          4.9         3.0          1.4       100.2  setosa
#> 3          4.7         3.2          1.3       100.2  setosa

使用新列toto:

library(dplyr)
(iris [1:3,] 
  %>% mutate_at(c("Petal.Width"),
                list(toto=~ifelse(Sepal.Width == 3.5, 
                             .+10,
                             .+100)
                )
  )
)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species  toto
#> 1          5.1         3.5          1.4         0.2  setosa  10.2
#> 2          4.9         3.0          1.4         0.2  setosa 100.2
#> 3          4.7         3.2          1.3         0.2  setosa 100.2

reprex包創建於2019-07-30(v0.2.1)

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM