Below is the dput of my dataset. I am trying to fill in the missing values such that, if an NA is present in a column for a particular year, the NA is replaced with the mean of the same variable in the other two years. For example, in the dataset below, Congo contains NA in the "Economy.2015" column, so that NA should be filled with the mean of the values in the "Economy.2016" and "Economy.2017" columns.
dput
structure(list(Country = c("Angola", "Bosnia and Herzegovina",
"Congo (Kinshasa)", "Greece", "Indonesia", "Iraq", "Sierra Leone",
"Sudan", "Togo"), Region = c("Sub-Saharan Africa", "Central and Eastern Europe",
"Sub-Saharan Africa", "Western Europe", "Southeastern Asia",
"Middle East and Northern Africa", "Sub-Saharan Africa", "Sub-Saharan Africa",
"Sub-Saharan Africa"), Happiness.Rank.2015 = c(137L, 96L, 120L,
102L, 74L, 112L, 123L, 118L, 158L), Happiness.Score.2015 = c(4.033,
4.949, 4.517, 4.857, 5.399, 4.677, 4.507, 4.55, 2.839), Standard.Error.2015 = c(0.04758,
0.06913, 0.0368, 0.05062, 0.02596, 0.05232, 0.07068, 0.0674,
0.06727), Economy.2015 = c(0.75778, 0.83223, NA, 1.15406, 0.82827,
0.98549, 0.33024, 0.52107, 0.20868), Family.2015 = c(0.8604,
0.91916, 1.0012, 0.92933, 1.08708, 0.81889, 0.95571, 1.01404,
0.13995), Health.2015 = c(0.16683, 0.79081, 0.09806, 0.88213,
0.63793, 0.60237, NA, 0.36878, 0.28443), Freedom.2015 = c(0.10384,
0.09245, 0.22605, 0.07699, 0.46611, NA, 0.4084, 0.10081, 0.36453
), Trust.2015 = c(0.07122, 0.00227, 0.07625, 0.01397, NA, 0.13788,
0.08786, 0.1466, 0.10731), Generosity.2015 = c(0.12344, 0.24808,
0.24834, NA, 0.51535, 0.17922, 0.21488, 0.19062, 0.16681), Dystopia.Residual.2015 = c(1.94939,
2.06367, 2.86712, 1.80101, 1.86399, 1.95335, 2.51009, 2.20857,
1.56726), Region.2016 = c("Sub-Saharan Africa", "Central and Eastern Europe",
"Sub-Saharan Africa", "Western Europe", "Southeastern Asia",
"Middle East and Northern Africa", "Sub-Saharan Africa", "Sub-Saharan Africa",
"Sub-Saharan Africa"), Happiness.Rank.2016 = c(141L, 87L, 125L,
99L, 79L, 112L, 111L, 133L, 155L), Happiness.Score.2016 = c(3.866,
5.163, 4.272, 5.033, 5.314, 4.575, 4.635, 4.139, 3.303), Lower.CI.2016 = c(3.753,
5.063, 4.191, 4.935, 5.237, 4.446, 4.505, 3.928, 3.192), Upper.CI.2016 = c(3.979,
5.263, 4.353, 5.131, 5.391, 4.704, 4.765, 4.35, 3.414), Economy.2016 = c(0.84731,
0.93383, 0.05661, 1.24886, 0.95104, 1.07474, 0.36485, 0.63069,
0.28123), Family.2016 = c(0.66366, 0.64367, 0.80676, 0.75473,
0.87625, 0.59205, 0.628, 0.81928, NA), Health.2016 = c(0.04991,
0.70766, 0.188, 0.80029, 0.49374, 0.51076, NA, 0.29759, 0.24811
), Freedom.2016 = c(0.00589, 0.09511, 0.15602, 0.05822, 0.39237,
0.24856, 0.30685, NA, 0.34678), Trust.2016 = c(0.08434, NA, 0.06075,
0.04127, 0.00322, 0.13636, 0.08196, 0.10039, 0.11587), Generosity.2016 = c(0.12071,
0.29889, 0.25458, NA, 0.56521, 0.19589, 0.23897, 0.18077, 0.17517
), Dystopia.Residual.2016 = c(2.09459, 2.48406, 2.74924, 2.12944,
2.03171, 1.81657, 3.01402, 2.10995, 2.1354), Happiness.Rank.2017 = c(140L,
90L, 126L, 87L, 81L, 117L, 106L, 130L, 150L), Happiness.Score.2017 = c(3.79500007629395,
5.18200016021729, 4.28000020980835, 5.22700023651123, 5.26200008392334,
4.49700021743774, 4.70900011062622, 4.13899993896484, 3.49499988555908
), Whisker.high.2017 = c(3.95164193540812, 5.27633568674326,
4.35781083270907, 5.3252461694181, 5.35288859814405, 4.62259140968323,
4.85064333498478, 4.34574716508389, 3.59403811171651), whisker.low.2017 = c(3.63835821717978,
5.08766463369131, 4.20218958690763, 5.12875430360436, 5.17111156970263,
4.37140902519226, 4.56735688626766, 3.9322527128458, 3.39596165940166
), Economy.2017 = c(0.858428180217743, 0.982409417629242, 0.0921023488044739,
1.28948748111725, 0.995538592338562, 1.10271048545837, 0.36842092871666,
0.65951669216156, 0.305444717407227), Family.2017 = c(1.10441195964813,
1.0693359375, 1.22902345657349, 1.23941457271576, 1.27444469928741,
0.978613197803497, 0.984136044979095, 1.21400856971741, 0.431882530450821
), Health.2017 = c(0.0498686656355858, 0.705186307430267, 0.191407024860382,
0.810198903083801, 0.492345720529556, 0.501180469989777, 0.00556475389748812,
0.290920823812485, 0.247105568647385), Freedom.2017 = c(NA, 0.204403176903725,
0.235961347818375, 0.0957312509417534, 0.443323463201523, 0.288555532693863,
0.318697690963745, 0.0149958552792668, 0.38042613863945), Generosity.2017 = c(0.097926490008831,
0.328867495059967, 0.246455833315849, NA, 0.611704587936401,
0.19963726401329, 0.293040901422501, 0.182317450642586, 0.196896150708199
), Trust.2017 = c(0.0697203353047371, NA, 0.0602413564920425,
0.04328977689147, 0.0153171354904771, 0.107215754687786, 0.0710951760411263,
0.089847519993782, 0.0956650152802467), Dystopia.Residual.2017 = c(1.61448240280151,
1.89217257499695, 2.22495865821838, 1.74922156333923, 1.42947697639465,
1.31890726089478, 2.66845989227295, 1.68706583976746, 1.83722925186157
)), class = "data.frame", row.names = c(NA, -9L))
Structure of dataframe
Country Region Happiness.Rank.2015 Happiness.Score.2015
1 Angola Sub-Saharan Africa 137 4.033
2 Bosnia and Herzegovina Central and Eastern Europe 96 4.949
3 Congo (Kinshasa) Sub-Saharan Africa 120 4.517
4 Greece Western Europe 102 4.857
5 Indonesia Southeastern Asia 74 5.399
6 Iraq Middle East and Northern Africa 112 4.677
7 Sierra Leone Sub-Saharan Africa 123 4.507
8 Sudan Sub-Saharan Africa 118 4.550
9 Togo Sub-Saharan Africa 158 2.839
Standard.Error.2015 Economy.2015 Family.2015 Health.2015 Freedom.2015 Trust.2015 Generosity.2015
1 0.04758 0.75778 0.86040 0.16683 0.10384 0.07122 0.12344
2 0.06913 0.83223 0.91916 0.79081 0.09245 0.00227 0.24808
3 0.03680 NA 1.00120 0.09806 0.22605 0.07625 0.24834
4 0.05062 1.15406 0.92933 0.88213 0.07699 0.01397 NA
5 0.02596 0.82827 1.08708 0.63793 0.46611 NA 0.51535
6 0.05232 0.98549 0.81889 0.60237 NA 0.13788 0.17922
7 0.07068 0.33024 0.95571 NA 0.40840 0.08786 0.21488
8 0.06740 0.52107 1.01404 0.36878 0.10081 0.14660 0.19062
9 0.06727 0.20868 0.13995 0.28443 0.36453 0.10731 0.16681
Dystopia.Residual.2015 Region.2016 Happiness.Rank.2016 Happiness.Score.2016
1 1.94939 Sub-Saharan Africa 141 3.866
2 2.06367 Central and Eastern Europe 87 5.163
3 2.86712 Sub-Saharan Africa 125 4.272
4 1.80101 Western Europe 99 5.033
5 1.86399 Southeastern Asia 79 5.314
6 1.95335 Middle East and Northern Africa 112 4.575
7 2.51009 Sub-Saharan Africa 111 4.635
8 2.20857 Sub-Saharan Africa 133 4.139
9 1.56726 Sub-Saharan Africa 155 3.303
Lower.CI.2016 Upper.CI.2016 Economy.2016 Family.2016 Health.2016 Freedom.2016 Trust.2016
1 3.753 3.979 0.84731 0.66366 0.04991 0.00589 0.08434
2 5.063 5.263 0.93383 0.64367 0.70766 0.09511 NA
3 4.191 4.353 0.05661 0.80676 0.18800 0.15602 0.06075
4 4.935 5.131 1.24886 0.75473 0.80029 0.05822 0.04127
5 5.237 5.391 0.95104 0.87625 0.49374 0.39237 0.00322
6 4.446 4.704 1.07474 0.59205 0.51076 0.24856 0.13636
7 4.505 4.765 0.36485 0.62800 NA 0.30685 0.08196
8 3.928 4.350 0.63069 0.81928 0.29759 NA 0.10039
9 3.192 3.414 0.28123 NA 0.24811 0.34678 0.11587
Generosity.2016 Dystopia.Residual.2016 Happiness.Rank.2017 Happiness.Score.2017 Whisker.high.2017
1 0.12071 2.09459 140 3.795 3.951642
2 0.29889 2.48406 90 5.182 5.276336
3 0.25458 2.74924 126 4.280 4.357811
4 NA 2.12944 87 5.227 5.325246
5 0.56521 2.03171 81 5.262 5.352889
6 0.19589 1.81657 117 4.497 4.622591
7 0.23897 3.01402 106 4.709 4.850643
8 0.18077 2.10995 130 4.139 4.345747
9 0.17517 2.13540 150 3.495 3.594038
whisker.low.2017 Economy.2017 Family.2017 Health.2017 Freedom.2017 Generosity.2017 Trust.2017
1 3.638358 0.85842818 1.1044120 0.049868666 NA 0.09792649 0.06972034
2 5.087665 0.98240942 1.0693359 0.705186307 0.20440318 0.32886750 NA
3 4.202190 0.09210235 1.2290235 0.191407025 0.23596135 0.24645583 0.06024136
4 5.128754 1.28948748 1.2394146 0.810198903 0.09573125 NA 0.04328978
5 5.171112 0.99553859 1.2744447 0.492345721 0.44332346 0.61170459 0.01531714
6 4.371409 1.10271049 0.9786132 0.501180470 0.28855553 0.19963726 0.10721575
7 4.567357 0.36842093 0.9841360 0.005564754 0.31869769 0.29304090 0.07109518
8 3.932253 0.65951669 1.2140086 0.290920824 0.01499586 0.18231745 0.08984752
9 3.395962 0.30544472 0.4318825 0.247105569 0.38042614 0.19689615 0.09566502
Dystopia.Residual.2017
1 1.614482
2 1.892173
3 2.224959
4 1.749222
5 1.429477
6 1.318907
7 2.668460
8 1.687066
9 1.837229
Update#1: What I have tried
I have tried the apply function, using the code suggested by @RAB. It gave me the warning message shown below.
Code used
dt <- apply(df, 1, mean, na.rm=T)
Warning Message
1: In mean.default(newX[, i], ...) : argument is not numeric or logical: returning NA
str of dataframe
'data.frame': 9 obs. of 35 variables:
$ Country : chr "Angola" "Bosnia and Herzegovina" "Congo (Kinshasa)" "Greece" ...
$ Region : chr "Sub-Saharan Africa" "Central and Eastern Europe" "Sub-Saharan Africa" "Western Europe" ...
$ Happiness.Rank.2015 : int 137 96 120 102 74 112 123 118 158
$ Happiness.Score.2015 : num 4.03 4.95 4.52 4.86 5.4 ...
$ Standard.Error.2015 : num 0.0476 0.0691 0.0368 0.0506 0.026 ...
$ Economy.2015 : num 0.758 0.832 NA 1.154 0.828 ...
$ Family.2015 : num 0.86 0.919 1.001 0.929 1.087 ...
$ Health.2015 : num 0.1668 0.7908 0.0981 0.8821 0.6379 ...
$ Freedom.2015 : num 0.1038 0.0925 0.2261 0.077 0.4661 ...
$ Trust.2015 : num 0.07122 0.00227 0.07625 0.01397 NA ...
$ Generosity.2015 : num 0.123 0.248 0.248 NA 0.515 ...
$ Dystopia.Residual.2015: num 1.95 2.06 2.87 1.8 1.86 ...
$ Region.2016 : chr "Sub-Saharan Africa" "Central and Eastern Europe" "Sub-Saharan Africa" "Western Europe" ...
$ Happiness.Rank.2016 : int 141 87 125 99 79 112 111 133 155
$ Happiness.Score.2016 : num 3.87 5.16 4.27 5.03 5.31 ...
$ Lower.CI.2016 : num 3.75 5.06 4.19 4.93 5.24 ...
$ Upper.CI.2016 : num 3.98 5.26 4.35 5.13 5.39 ...
$ Economy.2016 : num 0.8473 0.9338 0.0566 1.2489 0.951 ...
$ Family.2016 : num 0.664 0.644 0.807 0.755 0.876 ...
$ Health.2016 : num 0.0499 0.7077 0.188 0.8003 0.4937 ...
$ Freedom.2016 : num 0.00589 0.09511 0.15602 0.05822 0.39237 ...
$ Trust.2016 : num 0.08434 NA 0.06075 0.04127 0.00322 ...
$ Generosity.2016 : num 0.121 0.299 0.255 NA 0.565 ...
$ Dystopia.Residual.2016: num 2.09 2.48 2.75 2.13 2.03 ...
$ Happiness.Rank.2017 : int 140 90 126 87 81 117 106 130 150
$ Happiness.Score.2017 : num 3.8 5.18 4.28 5.23 5.26 ...
$ Whisker.high.2017 : num 3.95 5.28 4.36 5.33 5.35 ...
$ whisker.low.2017 : num 3.64 5.09 4.2 5.13 5.17 ...
$ Economy.2017 : num 0.8584 0.9824 0.0921 1.2895 0.9955 ...
$ Family.2017 : num 1.1 1.07 1.23 1.24 1.27 ...
$ Health.2017 : num 0.0499 0.7052 0.1914 0.8102 0.4923 ...
$ Freedom.2017 : num NA 0.2044 0.236 0.0957 0.4433 ...
$ Generosity.2017 : num 0.0979 0.3289 0.2465 NA 0.6117 ...
$ Trust.2017 : num 0.0697 NA 0.0602 0.0433 0.0153 ...
$ Dystopia.Residual.2017: num 1.61 1.89 2.22 1.75 1.43 ...
Note: I am new to R, please provide an explanation along with the code.
Your data needs to be numeric for this to work, so step 1 will be to filter out only the numeric data (we will put the other stuff back in later)
You will need to replace "yourdata" with your dataframe name
Step 1: filter for only numeric
df <- Filter(is.numeric, yourdata)
Step 2: get the means
mns <- rowMeans(df, na.rm = TRUE) # this gets the mean of each row, ignoring NAs
Step 3: find the indexes of the NA values
nas <- as.data.frame(which(is.na(df), arr.ind = TRUE))
# the data frame makes it easier to extract the row info for later
Step 4: substitute the NA values with the corresponding mean
df[which(is.na(df), arr.ind = TRUE)] <- mns[nas$row]
Step 5: combine the non-numeric columns with the new columns
new_df <- cbind(Filter(Negate(is.numeric), yourdata), df)
Edit:
I was bored, so here's a function for you:
# Fill missing values within groups of related numeric columns.
#
# Each entry of `groups` is passed to grep() as a pattern against the names of
# the numeric columns of `df`; the matching columns form one group (for
# example, "Economy" selects Economy.2015, Economy.2016 and Economy.2017).
# Every NA in a group is replaced by the mean of the non-missing values found
# in the same row of that group.
#
# Args:
#   df:     A data frame. Non-numeric columns are passed through untouched.
#   groups: Character vector of patterns (regular expressions), one per group.
#
# Returns:
#   `df` with the same columns in the same order and the NAs of the selected
#   groups filled in. A cell stays NA when its row has no observed value in
#   the group. Columns that receive a fill become double.
replace_missing <- function(df, groups) {
  df_num <- Filter(is.numeric, df)
  for (pattern in groups) {
    members <- grep(pattern, names(df_num), value = TRUE)
    if (length(members) == 0L) {
      next
    }
    # drop = FALSE keeps a data frame even when only one column matches.
    tmp <- df_num[, members, drop = FALSE]
    is_missing <- is.na(tmp)
    if (!any(is_missing)) {
      next
    }
    mns <- rowMeans(tmp, na.rm = TRUE)
    # Assignment through a logical matrix fills column by column, so the row
    # means are listed in that same column-major order.
    fill <- mns[row(is_missing)[is_missing]]
    # A row with nothing to average yields NaN; leave such cells as NA.
    fill[is.nan(fill)] <- NA
    tmp[is_missing] <- fill
    # Keep df_num in sync so a later, overlapping group sees the filled values.
    df_num[members] <- tmp
    df[members] <- tmp
  }
  df
}
# Each entry of `groups` is matched against the numeric column names with
# grep(), so it acts as a pattern selecting one family of columns.
new_data <- replace_missing(yourdata, groups = c("Happiness.Rank", "Happiness.Score",
"Family", "Economy"))
You can add as many entries as you want to the groups argument.
Here is a fairly straightforward tidyverse solution; the key is to reshape the data from wide to long, then replace the NA values group-wise before transforming the data back to wide. I give (some) explanations at the end, but I encourage you to execute the code line by line to understand what every step does.
library(tidyverse)
# Fill each NA with the mean of the same variable over the other years of the
# same country.
df.new <- df %>%
  # Wide -> long: one row per Country and original column name (`key`).
  pivot_longer(
    cols = -c(Country, Region, Region.2016),
    names_to = "key",
    values_to = "val"
  ) %>%
  # Split e.g. "Economy.2016" into what = "Economy" and when = "2016".
  separate(key, c("what", "when"), sep = "\\.(?=\\d)", remove = FALSE) %>%
  group_by(Country, what) %>%
  mutate(val = replace(val, is.na(val), mean(val, na.rm = TRUE))) %>%
  ungroup() %>%
  select(-what, -when) %>%
  # Long -> wide; names_sort = TRUE orders the value columns alphabetically.
  pivot_wider(names_from = key, values_from = val, names_sort = TRUE)
df.new
## A tibble: 9 x 35
# Country Region Region.2016 Dystopia.Residu… Dystopia.Residu… Dystopia.Residu…
# <chr> <chr> <chr> <dbl> <dbl> <dbl>
#1 Angola Sub-S… Sub-Sahara… 1.95 2.09 1.61
#2 Bosnia… Centr… Central an… 2.06 2.48 1.89
#3 Congo … Sub-S… Sub-Sahara… 2.87 2.75 2.22
#4 Greece Weste… Western Eu… 1.80 2.13 1.75
#5 Indone… South… Southeaste… 1.86 2.03 1.43
#6 Iraq Middl… Middle Eas… 1.95 1.82 1.32
#7 Sierra… Sub-S… Sub-Sahara… 2.51 3.01 2.67
#8 Sudan Sub-S… Sub-Sahara… 2.21 2.11 1.69
#9 Togo Sub-S… Sub-Sahara… 1.57 2.14 1.84
## ... with 29 more variables: Economy.2015 <dbl>, Economy.2016 <dbl>,
## Economy.2017 <dbl>, Family.2015 <dbl>, Family.2016 <dbl>,
## Family.2017 <dbl>, Freedom.2015 <dbl>, Freedom.2016 <dbl>,
## Freedom.2017 <dbl>, Generosity.2015 <dbl>, Generosity.2016 <dbl>,
## Generosity.2017 <dbl>, Happiness.Rank.2015 <dbl>,
## Happiness.Rank.2016 <dbl>, Happiness.Rank.2017 <dbl>,
## Happiness.Score.2015 <dbl>, Happiness.Score.2016 <dbl>,
## Happiness.Score.2017 <dbl>, Health.2015 <dbl>, Health.2016 <dbl>,
## Health.2017 <dbl>, Lower.CI.2016 <dbl>, Standard.Error.2015 <dbl>,
## Trust.2015 <dbl>, Trust.2016 <dbl>, Trust.2017 <dbl>, Upper.CI.2016 <dbl>,
## Whisker.high.2017 <dbl>, whisker.low.2017 <dbl>
Explanation:
1. We reshape the data from wide to long, keeping Country, Region and Region.2016 as they are. All other column names are given in the new column key, with values in val.
2. We separate key entries such as "Happiness.Score.2016" into "Happiness.Score" (column what) and "2016" (column when).
3. We group by Country and what.
4. We replace NAs per Country and what by the mean value across all years.
5. We ungroup and remove the what and when columns before reshaping the data from long back to wide.
Mind you, it might actually be a lot easier (and more in line with "tidy" data) to keep your data in long format; but that's just my opinion.
Let's check for Country == "Congo"
df.new %>% filter(str_detect(Country, "Congo")) %>% select(contains("Economy"))
## A tibble: 1 x 3
# Economy.2015 Economy.2016 Economy.2017
# <dbl> <dbl> <dbl>
#1 0.0744 0.0566 0.0921
and compare with the original data
df %>% filter(str_detect(Country, "Congo")) %>% select(contains("Economy"))
# Economy.2015 Economy.2016 Economy.2017
#1 NA 0.05661 0.09210235
So here 0.0744 = 1/2 * (0.05661 + 0.09210235)
.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.