I have a huge data frame `x` with 10 million observations and 50 variables. Currently I'm using the "grepl", "str_replace" and "gsub" functions as follows for data cleaning, which is very time consuming (each line takes 5 minutes).
Is there a more efficient function, or a way to rewrite the code, to reduce the run time, please?
# Row filters: drop every row whose ITEM_1 contains "A", "B", "C" or "D", then
# every row whose ITEM_2 contains "E". `perl = TRUE` is an argument of grepl(),
# so it goes inside the grepl() call, not inside the `[` subscript.
x <- x[!grepl("A", x$ITEM_1, perl = TRUE), ]
x <- x[!grepl("B", x$ITEM_1, perl = TRUE), ]
x <- x[!grepl("C", x$ITEM_1, perl = TRUE), ]
x <- x[!grepl("D", x$ITEM_1, perl = TRUE), ]
x <- x[!grepl("E", x$ITEM_2, perl = TRUE), ]
# Remove every literal "?" from columns 2 to 50.
x <- x %>% mutate_at(vars(2:50), funs(gsub("\\?", "", ., perl = TRUE)))
# Strip special characters from SUBNAMEZ, one character per pass.
# NOTE(review): str_replace() only replaces the first match in each string;
# str_replace_all() is needed if every occurrence should go - confirm intent.
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ, "#", "")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ, "@", "")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ, "~", "")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ, "\\(", "")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ, "\\)", "")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ, "&", "")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ, "\\\\", "")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ, "/", "")
Regards,
The following shows comparative timings of the OP's code in the question and a simplification of that code.
It was tested with a data frame of n = 10000 rows and 50 character columns. The speedup is worthwhile.
library(dplyr)
library(stringr)
library(stringi)
library(microbenchmark)
# Benchmark baseline: the question's cleaning steps wrapped in a function,
# with one pattern per call. Takes a data frame `x` that has the columns
# ITEM_1, ITEM_2 and SUBNAMEZ and returns the filtered, cleaned data frame.
fun.OP <- function(x){
# Drop rows whose ITEM_1 contains A, B, C or D (one scan per letter).
x <- x[!grepl("A", x$ITEM_1, perl = TRUE), ]
x <- x[!grepl("B", x$ITEM_1, perl = TRUE), ]
x <- x[!grepl("C", x$ITEM_1, perl = TRUE), ]
x <- x[!grepl("D", x$ITEM_1, perl = TRUE), ]
# Drop rows whose ITEM_2 contains E.
x <- x[!grepl("E", x$ITEM_2, perl = TRUE), ]
# Remove every literal "?" from all columns except the first.
x <- x %>% mutate_at(vars(2:ncol(x)), list(~gsub("\\?", "", .,perl=TRUE)))
# Strip the characters # @ ~ ( ) & \ / from SUBNAMEZ, one pass per character.
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"#","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"@","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"~","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"\\(","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"\\)","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"&","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"\\\\","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"/","")
# Return the cleaned data frame.
x
}
# Simplified version of the question's cleaning steps.
#
# Args:
#   x: data frame with the columns ITEM_1, ITEM_2 and SUBNAMEZ.
# Returns:
#   `x` without the rows whose ITEM_1 contains A-D or whose ITEM_2 contains E,
#   with "?" removed from every column but the first, the characters
#   # @ ~ ( ) & / \ removed from SUBNAMEZ, and row names reset to 1..nrow.
fun.Rui <- function(x){
  # One character class replaces four separate scans of ITEM_1.
  x <- x[!grepl('[A-D]', x$ITEM_1, perl = TRUE), ]
  x <- x[!grepl('E', x$ITEM_2, perl = TRUE), ]
  # seq_len(...)[-1L] is empty for a one-column frame, unlike 2:ncol(x).
  cols <- seq_len(ncol(x))[-1L]
  # "?" is matched literally, so a fixed-string replacement is enough.
  x[cols] <- lapply(x[cols], function(y) stri_replace_all_fixed(y, '?', ''))
  # Single pass over SUBNAMEZ with one character class. The previous pattern
  # ended in a stray '|' (an empty alternative that matches at every position)
  # and needed a second pass for the backslash. '&' is escaped so it cannot be
  # read as a set operator.
  x$SUBNAMEZ <- stri_replace_all_regex(x$SUBNAMEZ, '[#@~()\\&/\\\\]', '')
  row.names(x) <- NULL
  x
}
# Run both implementations once on the same input.
y1 <- fun.OP(x)
y2 <- fun.Rui(x)
# Show the dimensions of each result.
dim(y1)
dim(y2)
# Check that the two results are exactly the same object-wise.
identical(y1, y2)
# Time repeated runs of both implementations on the same data frame.
mb <- microbenchmark(
OP = fun.OP(x),
Rui = fun.Rui(x)
)
# Print the timings ordered by median run time.
print(mb, order = 'median')
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# Rui 17.05596 17.21667 21.41270 17.30466 17.44592 62.58906 100 a
# OP 42.88685 43.25211 54.68897 43.53331 43.98865 501.98495 100 b
Data creation code.
# Generate N random test strings for the benchmark column with index `col`.
#
# Args:
#   col: column index; 1 draws from upper-case letters only, 2 adds '?',
#        any other value adds '@', '#', '~', '(', ')', a backslash and '/'.
#   N:   number of strings to generate.
# Returns:
#   A character vector of length N. Each element is built from 1 to 50
#   symbols sampled with replacement from the column's alphabet.
makeString <- function(col, N){
  alphabet <- if(col == 1){
    LETTERS
  } else if(col == 2){
    c(LETTERS, '?')
  } else {
    c(LETTERS, '@', '#', '~', '(', ')', '\\', '/')
  }
  # Draw the length first, then that many symbols, once per output string.
  vapply(
    seq_len(N),
    function(i) paste(sample(alphabet, sample(50, 1), TRUE), collapse = ''),
    character(1)
  )
}
# Build the benchmark input: a data frame `x` with n rows and 50 columns of
# random strings, generated column by column with makeString().
set.seed(1234)  # fixed seed so the generated data is reproducible
n <- 1e4
x <- lapply(1:50, function(i) makeString(i, n))
names(x) <- sprintf("V%02d", seq_along(x))
# Pass stringsAsFactors explicitly so the columns stay character vectors on
# R versions older than 4.0.0, where the default converted strings to factors.
x <- do.call(cbind.data.frame, c(x, list(stringsAsFactors = FALSE)))
# The cleaning functions address the first three columns by these names.
names(x)[1:3] <- c('ITEM_1', 'ITEM_2', 'SUBNAMEZ')
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.