I'm coming from Stata and need to do some work in R for a specific project. Unfortunately, I haven't had much success with functional programming when trying to reference a vector of names in a dataframe within a function.
I've coded a function that will (1) create a dummy in my dataset for if the value is missing and then (2) convert the NA to a zero, using dplyr and lazyeval (below). I've been unable to pass a vector of variable names to this function within a second function.
The working function which performs the data modification:
dumfrmna <- function(dset, varn) {
expr <- enquo(varn)
x <- dset %>% summarise(sum(is.na(!!expr)))
if(x > 0) {
varname <- paste0(quo_name(expr), "_mis", sep = "")
dset <- mutate(dset, !!varname := is.na(!!expr),
!!expr = ifelse(is.na(!!expr), 0, !!expr))
}
return(dset)
}
The second function that fails to pass character names as to the working function:
misdum <- function(dset, vlist = character(), type = "exclude") {
#Err messages
if(!(is.vector(vlist) & is.character(vlist))) stop("vlist must be a character vector of variable names")
if(!((type == "include") | (type == "exclude"))) stop("type must take either \"include\" or \"exclude\"")
if(type == "exclude" & length(vlist) > 0 & all(vlist %in% names(dset))) stop("vlist contains variables not in dset")
if(length(vlist) == 0 & type == "include") {
stop("If type == include, vlist must be specified")
}
# Create list of included vars
else if(length(vlist) == 0 & type == "exclude") {
vname <- colnames(dset)
}
else if(length(vlist) > 0 & type == "include") {
vname <- vlist
}
else if(length(vlist) > 0 & type == "exclude") {
vname <- names(dest) %>%
setdiff(vlist)
}
#Create missing dummy
dset <- sapply(vname, function(x) dumfrmna(dset, x))
return(dset)
}
Additionally this fails when I try to apply that vector of variable names to the function in R. I've attempted to use various tactics like noquote(), as.name(), parse(eval()) to modify the vector of column names in the sapply in the second function, but that just breaks on the conditional in the first function. Is there anyway to strip the quotes of that vector name?
How should this code be structured in R?
It is better to provide a small reproducible example to test the functions. Here, we are using the inbuilt dataset mtcars
library(dplyr)
library(purrr)
data(mtcars)
Create some NA's in some columns.
mtcars$wt[5:6] <- NA
mtcars$qsec[7:9] <- NA
Changes in the dumfrna
- In the !!expr =
statement it should be :=
also the LHS would take a symbol or character.
dumfrmna <- function(dset, varn) {
expr <- enquo(varn)
exprC <- quo_name(expr)
x <- dset %>% summarise(sum(is.na(!!expr)))
x
if(x > 0) {
varname <- paste0(quo_name(expr), "_mis", sep = "")
dset <- mutate(dset, !! (varname) := is.na(!!expr),
!! (exprC) := ifelse(is.na(!!expr), 0, !!expr))
}
return(dset)
}
In the second function, we evaluate the argument for dumfrna
ie !! x
!! x
. Here, we are assuming that vlist
takes a character vector
as argument
misdum <- function(dset, vlist = character(), type = "exclude") {
if(!(is.vector(vlist) & is.character(vlist))) stop("vlist must be a character vector of variable names")
if(!((type == "include") | (type == "exclude"))) stop("type must take either \"include\" or \"exclude\"")
if(type == "exclude" & length(vlist) > 0 & all(vlist %in% names(dset))) stop("vlist contains variables not in dset")
if(length(vlist) == 0 & type == "include") {
stop("If type == include, vlist must be specified")
}
# Create list of included vars
else if(length(vlist) == 0 & type == "exclude") {
vname <- colnames(dset)
}
else if(length(vlist) > 0 & type == "include") {
vname <- vlist
}
else if(length(vlist) > 0 & type == "exclude") {
vname <- names(dest) %>%
setdiff(vlist)
}
#Create missing dummy
dset <- map(rlang::syms(vname), ~ dumfrmna(dset, !! .x))
return(dset)
}
Try the functions with the data
res <- misdum(mtcars, vlist =c("wt", "qsec"), type = "include")
map(res, head, n = 9)
#[[1]]
# mpg cyl disp hp drat wt qsec vs am gear carb wt_mis
#1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 FALSE
#2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 FALSE
#3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 FALSE
#4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 FALSE
#5 18.7 8 360.0 175 3.15 0.000 17.02 0 0 3 2 TRUE
#6 18.1 6 225.0 105 2.76 0.000 20.22 1 0 3 1 TRUE
#7 14.3 8 360.0 245 3.21 3.570 NA 0 0 3 4 FALSE
#8 24.4 4 146.7 62 3.69 3.190 NA 1 0 4 2 FALSE
#9 22.8 4 140.8 95 3.92 3.150 NA 1 0 4 2 FALSE
#[[2]]
# mpg cyl disp hp drat wt qsec vs am gear carb qsec_mis
#1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 FALSE
#2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 FALSE
#3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 FALSE
#4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 FALSE
#5 18.7 8 360.0 175 3.15 NA 17.02 0 0 3 2 FALSE
#6 18.1 6 225.0 105 2.76 NA 20.22 1 0 3 1 FALSE
#7 14.3 8 360.0 245 3.21 3.570 0.00 0 0 3 4 TRUE
#8 24.4 4 146.7 62 3.69 3.190 0.00 1 0 4 2 TRUE
#9 22.8 4 140.8 95 3.92 3.150 0.00 1 0 4 2 TRUE
This approach simplifies some of the dplyr expressions to use base R indexing. Here the dumfrmna
function accepts a dataframe with single column only,
library(dplyr)
# function takes a single data frame column, and
# returns the unchanged column if there are no NAs otherwise
# returns NA s changed to zero and an additional column named with
# _mis as a suffix.
dumfrmna <- function(df_column) {
x <- sum(is.na(df_column))
if(x > 0) {
# new column, suffixed _mis, with TRUE for NAs
varname <- paste0(names(df_column)[1], "_mis")
df_column[varname] <- is.na(df_column[[1]])
# zero out the NAs in his single column
df_column[is.na(df_column[[1]]),1] <- 0
}
return(df_column)
}
misdum <- function(dset, vlist = character(), type = "exclude") {
#Err messages
if(!(is.vector(vlist) & is.character(vlist))) stop("vlist must be a character vector of variable names")
if(!((type == "include") | (type == "exclude"))) stop("type must take either \"include\" or \"exclude\"")
if(type == "exclude" & length(vlist) > 0 &
!all(vlist %in% names(dset))) stop("vlist contains variables not in dset") # mod !all
if(length(vlist) == 0 & type == "include") {
stop("If type == include, vlist must be specified")
}
# Create list of included vars
else if(length(vlist) == 0 & type == "exclude") {
vname <- colnames(dset)
}
else if(length(vlist) > 0 & type == "include") {
vname <- vlist
}
else if(length(vlist) > 0 & type == "exclude") {
vname <- names(dset) %>% ## mod dset
setdiff(vlist)
}
# Create missing dummy by passing the single column -
# the return from dumfrma is a dataframe with 1 or two columns
new_columns <- sapply(vname, function(x) dumfrmna(dset[x]), simplify = FALSE)
dset <- bind_cols(new_columns)
return(dset)
}
df = read.csv(text="
c1,c2,c3
a,1,11
b,2,12
c,NA,12
d,2,NA
", stringsAsFactors=FALSE)
# test the dummy insertion for a single column
dumfrmna(df["c1"]) ## should be unchangd
dumfrmna(df["c2"]) ## should add a column "c2_mis" and zero out the NA
dumfrmna(df["c3"]) ## should add a column "c3_mis" and zero out the NA
# test on the full frame
misdum(df, vlist = c("c2"), type = "exclude")
# > dumfrmna(df["c2"]) ## should add a column "c2_mis" and zero out the NA
# c2 c2_mis
# 1 1 FALSE
# 2 2 FALSE
# 3 0 TRUE
# 4 2 FALSE
#
# > dumfrmna(df["c3"]) ## should add a column "c3_mis" and zero out the NA
# c3 c3_mis
# 1 11 FALSE
# 2 12 FALSE
# 3 12 FALSE
# 4 0 TRUE
#
# > # test on the full frame
# > misdum(df, vlist = c("c2"), type = "exclude")
# c1 c3 c3_mis
# 1 a 11 FALSE
# 2 b 12 FALSE
# 3 c 12 FALSE
# 4 d 0 TRUE
# >
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.