繁体   English   中英

数据框中的所有列 NA 值都填充了 R 中的中值

[英]All the column NA values in a dataframe fill with median values in R

我需要用数据框中每列的中值填充所有数字列的空值。 我做了以下代码。

median_forNumericalNulls <- function(dataframe){
  
  nums <- unlist(lapply(dataframe, is.numeric))  
  
  df_num <- dataframe[ , nums]
  
  df_num[] <- lapply(df_num, function(x) { 
    x[is.na(x)] <- median(x, na.rm = TRUE)
    x
  })      

  return(dataframe)
  
}

median_forNumericalNulls(A)

A 是父表,它由数值变量和分类变量组成。 如何用函数median_forNumericalNulls的输出替换'A'数据帧的列?

有没有更好的方法可以让我们做同样的事情?

可能我们需要更改函数以直接对列进行子集化并更新列,而不是创建另一个对象然后更新

median_forNumericalNulls <- function(dataframe){
  
  nums <- unlist(lapply(dataframe, is.numeric))  
  
  df_num <- dataframe[ , nums]
  
  dataframe[nums] <- lapply(dataframe[nums], function(x) { 
    x[is.na(x)] <- median(x, na.rm = TRUE)
    x
  })      
  dataframe
  
}

-测试

A <- median_forNumericalNulls(A)

此外,这可以通过na.aggregate以紧凑的方式完成

library(zoo)
A <- na.aggregate(A, FUN = median)

或者使用tidyverse

library(dplyr)
A <- A %>%
   mutate(across(where(is.numeric), 
         ~ replace(., is.na(.), median(., na.rm = TRUE))))

这是另一种方法,您可以这样做:示例:

librara(dplyr)
iris1 <- iris %>% 
  select(1, 2, 5)
head(iris1, 10) %>% 
  as_tibble() %>% 
  mutate(across(where(is.numeric), ~ifelse(.<= 3, NA, .))) %>% 
  mutate(across(where(is.numeric), ~ifelse(is.na(.), median(.,na.rm = TRUE), .)))
   Sepal.Length Sepal.Width Species
          <dbl>       <dbl> <fct>  
 1          5.1         3.5 setosa 
 2          4.9         3.4 setosa 
 3          4.7         3.2 setosa 
 4          4.6         3.1 setosa 
 5          5           3.6 setosa 
 6          5.4         3.9 setosa 
 7          4.6         3.4 setosa 
 8          5           3.4 setosa 
 9          4.4         3.4 setosa 
10          4.9         3.1 setosa 

基础 R 解决方案:

# Function to deteremine data.frame vector indices that are numeric:
# resolve_num_cols => function() 
resolve_num_cols <- function(df){
  # Store a vector of numeric column names: 
  # num_cols => logical vector
  num_cols <- which(
    vapply(
      df, 
      is.numeric, 
      logical(1),
      USE.NAMES = FALSE
    )
  )
  # Explicitly define the returned object: logical vector => env
  return(num_cols)
}

# Function to impute median values for each numeric vector in data.frame
# impute_median_vals_in_df => function()
impute_median_vals_in_df <- function(df, num_col_idx){
  # Replace the na. values in each numeric vector: df => data.frame 
  df[,num_col_idx] <- lapply(
    num_col_idx, 
    function(col_idx){
      df[,col_idx] <- ifelse(
        is.na(df[,col_idx]),
        median(df[,col_idx], na.rm = TRUE),
        df[,col_idx]
      )
    }
  )
  # Return the data.frame object: data.frame => env
  return(df)
}

# Apply the function to resolve the numeric vectors in data.frame: 
# num_cols => integer vector
num_cols <- resolve_num_cols(df1)

# Apply the function: clean_df => data.frame
clean_df <- impute_median_vals_in_df(df1, num_cols)

使用的数据:

# Import data: df1 => data.frame
df1 <- structure(list(mpg = c(NA, 21, NA, 21.4, 18.7, 18.1, 14.3, 24.4, 
22.8, 19.2, NA, 16.4, 17.3, NA, NA, 10.4, 14.7, 32.4, 30.4, 33.9, 
21.5, NA, 15.2, 13.3, 19.2, 27.3, 26, NA, NA, 19.7, 15, 21.4), 
cyl = c(NA, 6, NA, 6, 8, NA, NA, 4, 4, 6, 6, 8, 8, 8, 8, 
8, 8, NA, NA, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, NA, NA, 4), disp = c(160, 
160, 108, 258, 360, 225, 360, 146.7, 140.8, 167.6, 167.6, 
NA, NA, 275.8, NA, 460, 440, 78.7, 75.7, 71.1, NA, 318, NA, 
350, 400, NA, NA, 95.1, 351, NA, 301, 121), hp = c(110, 110, 
93, 110, NA, 105, 245, 62, 95, 123, 123, 180, NA, 180, 205, 
215, NA, 66, NA, 65, 97, NA, NA, 245, 175, 66, 91, NA, 264, 
175, NA, 109), drat = c(3.9, 3.9, 3.85, 3.08, 3.15, 2.76, 
3.21, NA, 3.92, 3.92, 3.92, 3.07, NA, 3.07, 2.93, 3, 3.23, 
4.08, NA, 4.22, NA, 2.76, 3.15, 3.73, 3.08, NA, NA, 3.77, 
4.22, 3.62, NA, NA), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 
3.46, 3.57, NA, 3.15, 3.44, 3.44, 4.07, NA, NA, 5.25, 5.424, 
5.345, 2.2, 1.615, 1.835, NA, NA, 3.435, 3.84, NA, NA, NA, 
1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61, 
19.44, NA, NA, NA, 20, NA, 18.3, 18.9, 17.4, 17.6, 18, NA, 
17.82, NA, 19.47, 18.52, 19.9, NA, 16.87, NA, 15.41, 17.05, 
18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6), vs = c(0, NA, 
1, 1, NA, NA, 0, NA, 1, NA, 1, 0, 0, 0, 0, 0, 0, 1, NA, 1, 
1, 0, 0, 0, NA, 1, NA, 1, 0, 0, 0, 1), am = c(NA, NA, NA, 
0, NA, 0, NA, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, NA, 
0, 0, 0, NA, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3, NA, 
3, 3, NA, NA, 4, 4, 3, 3, 3, 3, 3, NA, 4, 4, 4, 3, 3, 3, 
3, 3, 4, 5, NA, NA, NA, NA, 4), carb = c(4, 4, 1, 1, 2, 1, 
4, NA, NA, 4, NA, 3, NA, 3, NA, 4, 4, 1, 2, 1, 1, 2, 2, 4, 
NA, NA, 2, 2, NA, 6, 8, 2)), row.names = c("Mazda RX4", "Mazda RX4 Wag", 
"Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", "Valiant", 
"Duster 360", "Merc 240D", "Merc 230", "Merc 280", "Merc 280C", 
"Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood", 
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic", 
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin", 
"Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2", 
"Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora", 
"Volvo 142E"), class = "data.frame")

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM