簡體   English   中英

R:在tidyverse中創建一個function

[英]R: create a function in tidyverse

我有一些假數據:

library(tidyverse)
df <- data.frame(id = 1:20,
                 var1 = sample(c(0,1), size = 20, replace = T),
                 var2 = round(runif(20, min = 0, max = 100),0),
                 var3 = round(runif(20, min = 0, max = 100),0),
                 var4 = round(rnorm(20, mean = 50, sd = 20)),
                 var5 = sample(c(1:19, NA), size=20))

然后,我想對這些數據做一些檢查:。 有錯誤和錯誤消息的行的 ID 應該放在 data.frame errors 我想使用管道運算符 %>% 調用 function

### Different checks

# There should be no missing values in var5
df %>% filter(is.na(var5)) %>% add_errors("There are NAs in var5")

# var3 should be greater than var4
df %>% filter(var3 < var4) %>% add_errors("var3 is smaller than var4")

# ... etc.

然后我必須定義 function add_errors()

### Define function

errors <- data.frame(id = numeric(), errormessage = character())

add_errors <- function(dat, error){
    errors <<- add_case(errors, id = dat[['id']], errormessage = error)
}

我們可以在控制台上打印錯誤消息

add_errors <- function(dat, error) {
    glue::glue("{error} at id: {toString(dat[['id']])}")
   }

-測試

df %>%
    filter(is.na(var5)) %>% 
    add_errors("There are NAs in var5")
#There are NAs in var5 at id: 6

df %>%
   filter(var3 < var4) %>%
   add_errors("var3 is smaller than var4")
#var3 is smaller than var4 at id: 1, 2, 3, 4, 6, 7, 8, 11, 15, 16, 17, 20

或返回帶有錯誤消息的 tibble/data.frame output

add_errors <- function(dat, error) {
     tibble(id = dat[['id']], errormessage = error)
    }
    
df %>%
     filter(is.na(var5)) %>% 
     add_errors("There are NAs in var5")
# A tibble: 1 x 2
#     id errormessage         
#  <int> <chr>                
#1     6 There are NAs in var5

一個選項是使用logger ,它可以更靈活地添加錯誤、警告、信息等以及時間戳

#remotes::install_github('daroczig/logger')
library(logger)
log_layout(layout_glue_colors)
t <- tempfile()
log_appender(appender_file(t))
log_info('Script starting up...')

df %>%
     filter(is.na(var5)) %>%
    {log_error('There are NAs in var5')}
    
df %>%
   filter(var3 < var4) %>%
   {log_error("var3 is smaller than var4")}
cat(readLines(t), sep="\n")
#INFO [2021-02-28 14:28:42] Script starting up...
#ERROR [2021-02-28 14:28:42] There are NAs in var5
#ERROR [2021-02-28 14:28:43] var3 is smaller than var4

unlink(t)

t是臨時文件,也可以寫入自定義目標文件夾

以下代碼執行與您所要求的類似的操作。 我嘗試在不將錯誤數據幀作為參數傳遞的情況下執行此操作,但最終不會更改 function 之外的錯誤變量。

errors=data.frame(id=numeric(), errormessage=character())
add_errors=function(df, errormessage) {
    return(bind_rows(errors, data.frame(id=df$id, errormessage=errormessage)))
}
errors=df %>% filter(is.na(var5)) %>% add_errors("There are NAs in var5") 
errors=df %>% filter(var3 > var4) %>% add_errors("var3 is smaller than var4")

Output:

> print(errors)
  id              errormessage
1  3     There are NAs in var5
2  2 var3 is smaller than var4
3  3 var3 is smaller than var4
4  7 var3 is smaller than var4
5  8 var3 is smaller than var4
6  9 var3 is smaller than var4
7 12 var3 is smaller than var4
8 16 var3 is smaller than var4
9 18 var3 is smaller than var4

我知道這個問題是關於創建自定義 function 來檢查錯誤。 但是有一個很好的 package 叫做 {pointblank} 正是為這種任務而設計的。

我們可以設置一個所謂的“代理”並“詢問”它以獲得一個不錯的報告,而不是設置一個名為errordata.frame 有幾種替代工作流程可以檢查軟件包網站上描述的錯誤。 以下是在您的問題上使用 package 的一種可能方法。

library(dplyr)
library(pointblank)

df <- data.frame(id = 1:20,
                 var1 = sample(c(0,1), size = 20, replace = T),
                 var2 = round(runif(20, min = 0, max = 100),0),
                 var3 = round(runif(20, min = 0, max = 100),0),
                 var4 = round(rnorm(20, mean = 50, sd = 20)),
                 var5 = sample(c(1:19, NA), size=20))
agent <- df %>%
  create_agent(
    label = "My error checks",
    actions = action_levels(stop_at = 1)
  ) %>%
  col_vals_not_null(var5) %>% 
  col_vals_not_in_set(
    vars(var3_lt_4),
    preconditions = ~ . %>% dplyr::mutate(var3_lt_4 = var3 > var4),
    set = FALSE) %>% 
  interrogate()
  
agent

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM