简体   繁体   中英

Reducing data cleaning run time with more efficient and faster functions

I have a huge data frame, x, with 10 million observations and 50 variables. Currently I'm using the "grepl", "str_replace" and "gsub" functions as follows for data cleaning, which is very time-consuming (each line takes about 5 minutes).

Is there a more efficient function, or a way to rewrite the code, to reduce the run time, please?

 # Drop rows whose ITEM_1 contains A, B, C or D, then rows whose ITEM_2
 # contains E. `perl = TRUE` is an argument of grepl(), not of `[`.
 x <- x[!grepl("A", x$ITEM_1, perl = TRUE), ]
 x <- x[!grepl("B", x$ITEM_1, perl = TRUE), ]
 x <- x[!grepl("C", x$ITEM_1, perl = TRUE), ]
 x <- x[!grepl("D", x$ITEM_1, perl = TRUE), ]
 x <- x[!grepl("E", x$ITEM_2, perl = TRUE), ]

 # Remove every "?" from columns 2 to 50. list(~ ...) replaces the deprecated
 # funs() wrapper.
 x <- x %>% mutate_at(vars(2:50), list(~gsub("\\?", "", ., perl = TRUE)))

  # Remove all occurrences of each unwanted character from SUBNAMEZ.
  # str_replace() would only remove the first match in each string, so the
  # *_all variant is used to actually clean the column.
  x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ, "#", "")
  x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ, "@", "")
  x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ, "~", "")
  x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ, "\\(", "")
  x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ, "\\)", "")
  x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ, "&", "")
  x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ, "\\\\", "")
  x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ, "/", "")

Regards,

The following shows comparative timings of the OP's code in the question and a simplification of that code.
It was tested with a dataframe of n = 10000 rows and 50 character column-vectors. The speedup is worthwhile.

library(dplyr)
library(stringr)
library(stringi)
library(microbenchmark)

# Benchmark baseline: the question's cleaning steps, written as one call per
# pattern.
#
# x: data frame with columns ITEM_1, ITEM_2 and SUBNAMEZ.
# Returns the filtered data frame with "?" removed from columns 2 to ncol(x)
# and the listed special characters removed from SUBNAMEZ.
fun.OP <- function(x){
  # Drop rows whose ITEM_1 contains "A", "B", "C" or "D", then rows whose
  # ITEM_2 contains "E"; each pattern is a separate pass over the data frame.
  x <- x[!grepl("A", x$ITEM_1, perl = TRUE), ]
  x <- x[!grepl("B", x$ITEM_1, perl = TRUE), ]
  x <- x[!grepl("C", x$ITEM_1, perl = TRUE), ]
  x <- x[!grepl("D", x$ITEM_1, perl = TRUE), ]
  x <- x[!grepl("E", x$ITEM_2, perl = TRUE), ]

  # Remove every "?" from the second column through the last one.
  x <- x %>% mutate_at(vars(2:ncol(x)), list(~gsub("\\?", "", .,perl=TRUE)))

  # Remove #, @, ~, (, ), &, backslash and / from SUBNAMEZ, one call per
  # character.
  x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"#","")
  x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"@","")
  x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"~","")
  x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"\\(","")
  x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"\\)","")
  x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"&","")
  x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"\\\\","")
  x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"/","")
  x
}

# Faster equivalent of the question's cleaning steps; the second benchmark
# candidate.
#
# x: data frame with columns ITEM_1, ITEM_2 and SUBNAMEZ (assumed to be
#    character columns, as in the test data).
# Returns the rows of x whose ITEM_1 contains none of A-D and whose ITEM_2
# contains no E, with "?" removed from every column except the first, the
# characters # @ ~ ( ) & / \ removed from SUBNAMEZ, and row names reset.
fun.Rui <- function(x){
  # One character class replaces four separate single-letter filters.
  x <- x[!grepl('[A-D]', x$ITEM_1, perl = TRUE), ]
  x <- x[!grepl('E', x$ITEM_2, perl = TRUE), ]

  # "?" is a literal here, so fixed matching avoids the regex engine entirely.
  cols <- seq_len(ncol(x))[-1L]
  x[cols] <- lapply(x[cols], function(y) stri_replace_all_fixed(y, '?', ''))

  # Single pass over SUBNAMEZ. The alternation must not end in "|": a trailing
  # bar adds an empty alternative that matches at every position. The
  # backslash is included here instead of being removed in a second pass.
  x$SUBNAMEZ <- stri_replace_all_regex(x$SUBNAMEZ, '#|@|~|\\(|\\)|&|/|\\\\', '')
  row.names(x) <- NULL
  x
}

# Run both implementations once on the same input and compare the results.
# `x` is the test data frame; its creation code appears further down and must
# be run first.
y1 <- fun.OP(x)
y2 <- fun.Rui(x)
dim(y1)
dim(y2)
# NOTE(review): fun.Rui() resets row names while fun.OP() does not do so
# explicitly, and identical() also compares attributes - confirm this prints
# TRUE before relying on it.
identical(y1, y2)

# Time both implementations and print the summary ordered by median run time.
mb <- microbenchmark(
  OP = fun.OP(x),
  Rui = fun.Rui(x)
)
print(mb, order = 'median')
#Unit: milliseconds
# expr      min       lq     mean   median       uq       max neval cld
#  Rui 17.05596 17.21667 21.41270 17.30466 17.44592  62.58906   100  a 
#   OP 42.88685 43.25211 54.68897 43.53331 43.98865 501.98495   100   b

Data creation code.

# Generate N random strings for test-data column number `col`.
#
# col: column index. 1 samples from the capital letters only, 2 additionally
#      allows "?", and any other value additionally allows @ # ~ ( ) \ /.
# N:   number of strings to generate.
# Returns a character vector of length N; each string holds between 1 and 50
# characters sampled with replacement.
makeString <- function(col, N){
  extras <- if (col == 1) {
    character(0)
  } else if (col == 2) {
    "?"
  } else {
    c("@", "#", "~", "(", ")", "\\", "/")
  }
  alphabet <- c(LETTERS, extras)

  vapply(
    seq_len(N),
    function(i) {
      # Draw the length first, then the characters, one string at a time.
      len <- sample(50, 1)
      paste(sample(alphabet, len, TRUE), collapse = "")
    },
    character(1)
  )
}

# Build the benchmark input: a data frame of n rows and 50 random string
# columns named V01..V50, with the first three renamed to the columns the
# cleaning functions expect.
set.seed(1234)
n <- 1e4
x <- do.call(
  cbind.data.frame,
  setNames(
    lapply(seq_len(50), function(col_idx) makeString(col_idx, n)),
    sprintf("V%02d", seq_len(50))
  )
)
names(x)[1:3] <- c("ITEM_1", "ITEM_2", "SUBNAMEZ")

The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM