I have a data set that I need to tidy, but I cannot figure out how to do it in an efficient way. The data set has multiple values per cell in multiple columns, and I want to replace the multiple values per cell with one value, preferably the mean of the multiple values.
I am thinking that I need to first separate the columns with multiple values per cell into multiple columns (so that there is only one value per cell) and then calculate the mean and get that mean into a new column replacing the original column (and the extra columns I created while separating). How would I do this in an efficient way? Or what is a better solution?
My data has 663 rows and 55 columns, and many of the columns have multiple values per cell. Here is an example of part of my data:
X h_max h_min seed_wght ell_light_uk ell_moist_uk ell_pH_uk ell_N ell_S
1 Achillea millefolium 45 8 0.16;0.2 7;8;7 5;5;4 6x;6;NA 4;5;5 1;1;1
2 Achillea ptarmica 60 20 0.31;0.2 7;8;7 8;7;8 4;5;4 3;2;4 0;0;0
3 Aegopodium podagraria 100 40 1.99;2.2 5;6x;6 6;5;6 6;7;7 8;8;7 0;0;1
4 Agrimonia eupatoria 60 30 2.96;13 7;7;7 4;4;4 8;7;8 4;4;4 0;0;1
5 Agrostis canina 70 10 0.05;0.06 7;7;9 7;9;9 3;3;3 2;3;2 0;0;0
6 Agrostis capillaris 70 10 0.07;0.06;0.075 7;6;7 4x;NA;5 4;4;4 4;4;4 0;0;0
7 Agrostis gigantea 70 10 0.08;0.09 7;7;7 6;6;8 6;7;7 6;6;7 0;1;0
8 Agrostis stolonifera 100 15 0.07;0.02 7;7;8 6;7~;7 7;6x;NA 5;6;5 1;0;1
9 Ajuga chamaepitys 20 5 1.6602;1.16 7;7;8 4;3;4 8;9;8 2;2;4 0;0;0
10 Ajuga genevensis <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
11 Ajuga pyramidalis 30 10 1.373;1.6 7;7;7 5;5;5 1;1;5 1;2;2 0;0;0
12 Ajuga reptans 30 10 1.4 5;5;6 6;7;6 6;5;6 5;6;6 0;0;0
13 Alchemilla glabra 60 <NA> 0.51 7;6;NA 5;NA;6 6;6;NA 4;NA;6 0;0;NA
14 Alchemilla glaucescens 20 <NA> 0.487;0.46 7;7;7 5;5;5 4;4;7 3;5;4 0;0;0
15 Alchemilla monticola 40 <NA> 0.689 6;7;6 5;4;5 6;6;6 4;4;4 0;0;0
16 Alchemilla vulgaris <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
17 Alliaria petiolata 120 20 2.25;3.4;3.03 5;5;5x 5;6;5 7;7;7 8;8;9 0;0;0
18 Allium angulosum <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
19 Allium lusitanicum <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
20 Allium oleraceum 80 25 0.881 6;7;7 3;5;4 7;7;7 4;4;4 0;0;0
I would be very grateful for any help!
You can write a function which splits each value on the separator and calculates its mean
#' Mean of the ";"-separated numbers in each string
#'
#' Splits every element of `x` on ";" and returns the arithmetic mean of the
#' resulting pieces, one value per element of `x`.
#'
#' @param x A character vector such as `c("1;2;3", "4")`. Non-character
#'   input (e.g. a factor) is coerced with `as.character()` first.
#' @return A numeric vector with one mean per element of `x`. An element is
#'   `NA` when any of its pieces is missing or cannot be parsed as a number
#'   (in the latter case `as.numeric()` also emits its usual coercion
#'   warning).
split_and_mean <- function(x) {
  # strsplit() only accepts character input, so coerce factors and the like.
  if (!is.character(x)) {
    x <- as.character(x)
  }
  pieces <- strsplit(x, ";", fixed = TRUE)
  # vapply() guarantees a numeric result even for zero-length input, where
  # sapply() would return an empty list.
  vapply(pieces, function(piece) mean(as.numeric(piece)), numeric(1))
}
and then apply it on selected columns
# Overwrite columns 4 to 9 in place: every ";"-separated character column is
# replaced by the numeric vector of per-cell means.
df[, 4:9] <- lapply(X = df[, 4:9], FUN = split_and_mean)
# Show the tidied data frame.
df
# X h_max h_min seed_wght ell_light_uk ell_moist_uk ell_pH_uk ell_N ell_S
#1 Achilleamillefolium 45 8 0.1800 7.33 4.67 NA 4.67 1.000
#2 Achilleaptarmica 60 20 0.2550 7.33 7.67 4.33 3.00 0.000
#3 Aegopodiumpodagraria 100 40 2.0950 NA 5.67 6.67 7.67 0.333
#4 Agrimoniaeupatoria 60 30 7.9800 7.00 4.00 7.67 4.00 0.333
#5 Agrostiscanina 70 10 0.0550 7.67 8.33 3.00 2.33 0.000
#6 Agrostiscapillaris 70 10 0.0683 6.67 NA 4.00 4.00 0.000
#7 Agrostisgigantea 70 10 0.0850 7.00 6.67 6.67 6.33 0.333
#8 Agrostisstolonifera 100 15 0.0450 7.33 NA NA 5.33 0.667
#9 Ajugachamaepitys 20 5 1.4101 7.33 3.67 8.33 2.67 0.000
#10 Ajugagenevensis <NA> <NA> NA NA NA NA NA NA
#11 Ajugapyramidalis 30 10 1.4865 7.00 5.00 2.33 1.67 0.000
#12 Ajugareptans 30 10 1.4000 5.33 6.33 5.67 5.67 0.000
#13 Alchemillaglabra 60 <NA> 0.5100 NA NA NA NA NA
#14 Alchemillaglaucescens 20 <NA> 0.4735 7.00 5.00 5.00 4.00 0.000
#15 Alchemillamonticola 40 <NA> 0.6890 6.33 4.67 6.00 4.00 0.000
#16 Alchemillavulgaris <NA> <NA> NA NA NA NA NA NA
#17 Alliariapetiolata 120 20 2.8933 NA 5.33 7.00 8.33 0.000
#18 Alliumangulosum <NA> <NA> NA NA NA NA NA NA
#19 Alliumlusitanicum <NA> <NA> NA NA NA NA NA NA
#20 Alliumoleraceum 80 25 0.8810 6.67 4.00 7.00 4.00 0.000
This returns warnings because some cells contain entries (such as "<NA>" or "6x") that cannot be converted to numbers; those cells become NA, and it is safe to ignore the warnings.
Similar concept using tidyverse: convert the data to long format, split on ";", take the mean, and get the data back in wide format.
library(tidyverse)
# Reshape columns 4 to 9 into name/value pairs, replace each ";"-separated
# string with the mean of its pieces, then spread the names back into columns.
df %>%
  pivot_longer(cols = 4:9, names_to = "name", values_to = "value") %>%
  mutate(
    value = str_split(value, ";") %>%
      map_dbl(function(parts) mean(as.numeric(parts)))
  ) %>%
  pivot_wider(names_from = name, values_from = value)
data
I used this data to test the solutions. Make sure that your columns are characters and not factors before applying the solutions.
# Example data used to test the solutions: a 20-row data frame in which every
# column is a character vector. Columns 4 to 9 (seed_wght through ell_S) hold
# several ";"-separated entries per cell. Missing cells are stored as the
# literal string "<NA>" (not a real NA), and a few entries carry non-numeric
# suffixes such as "6x" or "7~".
df <- structure(list(X = c("Achilleamillefolium", "Achilleaptarmica",
"Aegopodiumpodagraria", "Agrimoniaeupatoria", "Agrostiscanina",
"Agrostiscapillaris", "Agrostisgigantea", "Agrostisstolonifera",
"Ajugachamaepitys", "Ajugagenevensis", "Ajugapyramidalis", "Ajugareptans",
"Alchemillaglabra", "Alchemillaglaucescens", "Alchemillamonticola",
"Alchemillavulgaris", "Alliariapetiolata", "Alliumangulosum",
"Alliumlusitanicum", "Alliumoleraceum"), h_max = c("45", "60",
"100", "60", "70", "70", "70", "100", "20", "<NA>", "30", "30",
"60", "20", "40", "<NA>", "120", "<NA>", "<NA>", "80"), h_min = c("8",
"20", "40", "30", "10", "10", "10", "15", "5", "<NA>", "10",
"10", "<NA>", "<NA>", "<NA>", "<NA>", "20", "<NA>", "<NA>", "25"
), seed_wght = c("0.16;0.2", "0.31;0.2", "1.99;2.2", "2.96;13",
"0.05;0.06", "0.07;0.06;0.075", "0.08;0.09", "0.07;0.02", "1.6602;1.16",
"<NA>", "1.373;1.6", "1.4", "0.51", "0.487;0.46", "0.689", "<NA>",
"2.25;3.4;3.03", "<NA>", "<NA>", "0.881"), ell_light_uk = c("7;8;7",
"7;8;7", "5;6x;6", "7;7;7", "7;7;9", "7;6;7", "7;7;7", "7;7;8",
"7;7;8", "<NA>", "7;7;7", "5;5;6", "7;6;NA", "7;7;7", "6;7;6",
"<NA>", "5;5;5x", "<NA>", "<NA>", "6;7;7"), ell_moist_uk = c("5;5;4",
"8;7;8", "6;5;6", "4;4;4", "7;9;9", "4x;NA;5", "6;6;8", "6;7~;7",
"4;3;4", "<NA>", "5;5;5", "6;7;6", "5;NA;6", "5;5;5", "5;4;5",
"<NA>", "5;6;5", "<NA>", "<NA>", "3;5;4"), ell_pH_uk = c("6x;6;NA",
"4;5;4", "6;7;7", "8;7;8", "3;3;3", "4;4;4", "6;7;7", "7;6x;NA",
"8;9;8", "<NA>", "1;1;5", "6;5;6", "6;6;NA", "4;4;7", "6;6;6",
"<NA>", "7;7;7", "<NA>", "<NA>", "7;7;7"), ell_N = c("4;5;5",
"3;2;4", "8;8;7", "4;4;4", "2;3;2", "4;4;4", "6;6;7", "5;6;5",
"2;2;4", "<NA>", "1;2;2", "5;6;6", "4;NA;6", "3;5;4", "4;4;4",
"<NA>", "8;8;9", "<NA>", "<NA>", "4;4;4"), ell_S = c("1;1;1",
"0;0;0", "0;0;1", "0;0;1", "0;0;0", "0;0;0", "0;1;0", "1;0;1",
"0;0;0", "<NA>", "0;0;0", "0;0;0", "0;0;NA", "0;0;0", "0;0;0",
"<NA>", "0;0;0", "<NA>", "<NA>", "0;0;0")), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20"), class = "data.frame")
With tidyverse, we can do
library(dplyr)
library(purrr)
# Convert columns 4 to 9 from ";"-separated strings to per-cell means.
# across() is the current replacement for the superseded mutate_at().
# na.rm = TRUE drops missing or unparseable entries before averaging, so a
# cell with no numeric entries at all yields NaN rather than NA.
df %>%
  mutate(across(4:9, function(col) {
    strsplit(col, ";") %>%
      map_dbl(function(parts) mean(as.numeric(parts), na.rm = TRUE))
  }))
# Example data: a 20-row data frame in which every column is a character
# vector. Columns 4 to 9 (seed_wght through ell_S) hold several ";"-separated
# entries per cell. Missing cells are stored as the literal string "<NA>"
# (not a real NA), and a few entries carry non-numeric suffixes such as "6x"
# or "7~".
df <- structure(list(X = c("Achilleamillefolium", "Achilleaptarmica",
"Aegopodiumpodagraria", "Agrimoniaeupatoria", "Agrostiscanina",
"Agrostiscapillaris", "Agrostisgigantea", "Agrostisstolonifera",
"Ajugachamaepitys", "Ajugagenevensis", "Ajugapyramidalis", "Ajugareptans",
"Alchemillaglabra", "Alchemillaglaucescens", "Alchemillamonticola",
"Alchemillavulgaris", "Alliariapetiolata", "Alliumangulosum",
"Alliumlusitanicum", "Alliumoleraceum"), h_max = c("45", "60",
"100", "60", "70", "70", "70", "100", "20", "<NA>", "30", "30",
"60", "20", "40", "<NA>", "120", "<NA>", "<NA>", "80"), h_min = c("8",
"20", "40", "30", "10", "10", "10", "15", "5", "<NA>", "10",
"10", "<NA>", "<NA>", "<NA>", "<NA>", "20", "<NA>", "<NA>", "25"
), seed_wght = c("0.16;0.2", "0.31;0.2", "1.99;2.2", "2.96;13",
"0.05;0.06", "0.07;0.06;0.075", "0.08;0.09", "0.07;0.02", "1.6602;1.16",
"<NA>", "1.373;1.6", "1.4", "0.51", "0.487;0.46", "0.689", "<NA>",
"2.25;3.4;3.03", "<NA>", "<NA>", "0.881"), ell_light_uk = c("7;8;7",
"7;8;7", "5;6x;6", "7;7;7", "7;7;9", "7;6;7", "7;7;7", "7;7;8",
"7;7;8", "<NA>", "7;7;7", "5;5;6", "7;6;NA", "7;7;7", "6;7;6",
"<NA>", "5;5;5x", "<NA>", "<NA>", "6;7;7"), ell_moist_uk = c("5;5;4",
"8;7;8", "6;5;6", "4;4;4", "7;9;9", "4x;NA;5", "6;6;8", "6;7~;7",
"4;3;4", "<NA>", "5;5;5", "6;7;6", "5;NA;6", "5;5;5", "5;4;5",
"<NA>", "5;6;5", "<NA>", "<NA>", "3;5;4"), ell_pH_uk = c("6x;6;NA",
"4;5;4", "6;7;7", "8;7;8", "3;3;3", "4;4;4", "6;7;7", "7;6x;NA",
"8;9;8", "<NA>", "1;1;5", "6;5;6", "6;6;NA", "4;4;7", "6;6;6",
"<NA>", "7;7;7", "<NA>", "<NA>", "7;7;7"), ell_N = c("4;5;5",
"3;2;4", "8;8;7", "4;4;4", "2;3;2", "4;4;4", "6;6;7", "5;6;5",
"2;2;4", "<NA>", "1;2;2", "5;6;6", "4;NA;6", "3;5;4", "4;4;4",
"<NA>", "8;8;9", "<NA>", "<NA>", "4;4;4"), ell_S = c("1;1;1",
"0;0;0", "0;0;1", "0;0;1", "0;0;0", "0;0;0", "0;1;0", "1;0;1",
"0;0;0", "<NA>", "0;0;0", "0;0;0", "0;0;NA", "0;0;0", "0;0;0",
"<NA>", "0;0;0", "<NA>", "<NA>", "0;0;0")), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20"), class = "data.frame")
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.