简体   繁体   中英

Replace a set number of NA's across multiple columns with a row mean

Working through survey data and I need to perform some imputing for occasional NA's popping up within the different questions of a measure. I want to replace NA with the row mean, but only on the condition that there are no more than 2 NA's across the row. Any tips on how to achieve this would be amazing.

I've tried using the code below (with some example data), but that doesn't allow me to control for how many NA's in a row that are acceptable.

data <- data.frame(
  var_1 = c(2,3,NA,2,3,5,NA,3),
  var_2 = c(3,4,2,3,1,3,NA,2),
  var_3 = c(NA,3,2,5,4,2,NA,2),
  var_4 = c(NA,3,NA,4,1,2,NA,1),
  var_5 = c(NA,4,2,3,2,3,NA,2),
  var_6 = c(4,2,1,NA,2,5,NA,3),
  var_7 = c(3,2,1,2,2,4,NA,3))

data_fix <- data %>%
  mutate(var_1 = ifelse(is.na(var_1),rowMeans(data[row_number(),], na.rm = T),var_1),
         var_2 = ifelse(is.na(var_2),rowMeans(data[row_number(),], na.rm = T),var_2),
         var_3 = ifelse(is.na(var_3),rowMeans(data[row_number(),], na.rm = T),var_3),
         var_4 = ifelse(is.na(var_4),rowMeans(data[row_number(),], na.rm = T),var_4),
         var_5 = ifelse(is.na(var_5),rowMeans(data[row_number(),], na.rm = T),var_5),
         var_6 = ifelse(is.na(var_6),rowMeans(data[row_number(),], na.rm = T),var_6),
         var_7 = ifelse(is.na(var_7),rowMeans(data[row_number(),], na.rm = T),var_7))

Here's an option:

rmeans <- rowMeans(data, na.rm = TRUE)
rmeans[rowSums(is.na(data)) > 2] <- NA
data2 <- data
data2[] <- rmeans
data[is.na(data)] <- data2[is.na(data)]
data
#>   var_1 var_2 var_3 var_4 var_5    var_6 var_7
#> 1   2.0     3    NA    NA    NA 4.000000     3
#> 2   3.0     4     3   3.0     4 2.000000     2
#> 3   1.6     2     2   1.6     2 1.000000     1
#> 4   2.0     3     5   4.0     3 3.166667     2
#> 5   3.0     1     4   1.0     2 2.000000     2
#> 6   5.0     3     2   2.0     3 5.000000     4
#> 7    NA    NA    NA    NA    NA       NA    NA
#> 8   3.0     2     2   1.0     2 3.000000     3

Created on 2022-04-20 by the reprex package (v2.0.1)

A dplyr solution:

library(dplyr)

data %>%
  mutate(na.count = rowSums(is.na(data)),
         row.mean = rowMeans(data, na.rm = TRUE),
         across(var_1:var_7, ~ ifelse(is.na(.x) & na.count <= 2, row.mean, .x))) %>%
  select(-c(na.count, row.mean))

  var_1 var_2 var_3 var_4 var_5    var_6 var_7
1   2.0     3    NA    NA    NA 4.000000     3
2   3.0     4     3   3.0     4 2.000000     2
3   1.6     2     2   1.6     2 1.000000     1
4   2.0     3     5   4.0     3 3.166667     2
5   3.0     1     4   1.0     2 2.000000     2
6   5.0     3     2   2.0     3 5.000000     4
7    NA    NA    NA    NA    NA       NA    NA
8   3.0     2     2   1.0     2 3.000000     3

Its base equivalent is:

na.count <- rowSums(is.na(data))
row.mean <- rowMeans(data, na.rm = TRUE)
data2 <- data
data2[] <- lapply(data, \(x) ifelse(is.na(x) & na.count <= 2, row.mean, x))
data2

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM