I have the following data set that I am trying to spread.
# Create the example data `df`: a tibble-classed data frame with 140 rows and
# three character columns:
#   file_number - file identifier (some carry a letter suffix, e.g. "3095839a");
#                 the same identifier can appear under several readers
#   reader      - reader code: "aa", "ae", "db", "dl", "mk", "mm" or "np"
#   event       - outcome recorded for that row, "pass" or "fail"
# NOTE: a (file_number, reader) pair is not guaranteed to be unique in this data.
df <- structure(list(file_number = c("3098129", "3096451", "3096774",
"3095276", "3095464", "3096846", "3097132", "3096355", "3096951",
"3096328", "3095441", "3096325", "3094412", "3096366", "3096372",
"3096507", "3098510", "3096335", "3096403", "3094343", "3096941",
"3096419", "3094431", "3096495", "3094647", "3094487", "3094947",
"3094398", "3094386", "3094367", "3097480", "3096425", "3095193",
"3095839a", "3097197", "3098453", "3098549", "3098428", "3096427",
"3096895", "3096434", "3094835", "3096312", "3094517", "3094372",
"3096387", "3096480", "3098504", "3096338", "3094615", "3096382",
"3096638", "3096750", "3096418", "3094734", "3098503", "3096311",
"3097197", "3094353", "3098442", "3097111", "3097325", "3096531",
"3096405", "3096301", "3096692", "3096495", "3098406", "3098422",
"3096315", "3096951", "3094491", "3096304", "3098416", "3096332",
"3098404", "3098419", "3095225", "3094404", "3096374", "3098411",
"3098556", "3096398", "3094421b", "3098477", "3094369b", "3098463",
"3096893", "3098514", "3098477", "3098465", "3094560", "3098409",
"3096434", "3097557", "3095061", "3098419", "3096404", "3095441",
"3096537", "3098503", "3098400", "3097808", "3096389b", "3098446",
"3096330", "3095533", "3094421a", "3094339", "3095578", "3094404",
"3098552", "3098514", "3096630", "3096941", "3097027", "3096322",
"3096514", "3098484", "3097038", "3096672", "3098483", "3094373",
"3096774", "3096677", "3096408", "3096664", "3096365", "3096491",
"3096820", "3096514", "3096556", "3096292", "3096495", "3094781",
"3094344", "3094487", "3094690", "3098504", "3096503"), reader = c("aa",
"aa", "aa", "aa", "aa", "aa", "aa", "aa", "aa", "aa", "aa", "aa",
"aa", "aa", "aa", "aa", "aa", "aa", "aa", "aa", "ae", "ae", "ae",
"ae", "ae", "ae", "ae", "ae", "ae", "ae", "ae", "ae", "ae", "ae",
"ae", "ae", "ae", "ae", "ae", "ae", "db", "db", "db", "db", "db",
"db", "db", "db", "db", "db", "db", "db", "db", "db", "db", "db",
"db", "db", "db", "db", "dl", "dl", "dl", "dl", "dl", "dl", "dl",
"dl", "dl", "dl", "dl", "dl", "dl", "dl", "dl", "dl", "dl", "dl",
"dl", "dl", "mk", "mk", "mk", "mk", "mk", "mk", "mk", "mk", "mk",
"mk", "mk", "mk", "mk", "mk", "mk", "mk", "mk", "mk", "mk", "mk",
"mm", "mm", "mm", "mm", "mm", "mm", "mm", "mm", "mm", "mm", "mm",
"mm", "mm", "mm", "mm", "mm", "mm", "mm", "mm", "mm", "np", "np",
"np", "np", "np", "np", "np", "np", "np", "np", "np", "np", "np",
"np", "np", "np", "np", "np", "np", "np"), event = c("fail",
"fail", "fail", "fail", "pass", "fail", "fail", "pass", "fail",
"fail", "pass", "pass", "pass", "fail", "fail", "pass", "pass",
"fail", "pass", "pass", "pass", "pass", "pass", "pass", "fail",
"fail", "pass", "pass", "fail", "pass", "pass", "pass", "pass",
"pass", "fail", "pass", "fail", "fail", "fail", "pass", "pass",
"pass", "fail", "pass", "pass", "fail", "pass", "fail", "fail",
"pass", "fail", "fail", "pass", "fail", "pass", "fail", "pass",
"fail", "fail", "fail", "fail", "pass", "pass", "fail", "pass",
"pass", "fail", "pass", "fail", "pass", "pass", "fail", "pass",
"fail", "fail", "pass", "pass", "fail", "pass", "pass", "fail",
"pass", "fail", "pass", "fail", "pass", "pass", "pass", "pass",
"fail", "pass", "pass", "fail", "pass", "fail", "pass", "fail",
"pass", "pass", "fail", "pass", "pass", "fail", "pass", "pass",
"fail", "pass", "fail", "fail", "fail", "pass", "pass", "pass",
"fail", "fail", "fail", "fail", "fail", "fail", "fail", "fail",
"fail", "pass", "fail", "fail", "fail", "pass", "pass", "pass",
"pass", "fail", "pass", "pass", "fail", "fail", "pass", "pass",
"fail", "fail", "fail")), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -140L))
>head(df)
file_number reader event
3098129 aa fail
3096451 aa fail
3096774 aa fail
3095276 aa fail
But when I run the following `tidyr::pivot_wider` call, I get the output `<S3: vctrs_list_of>`. I think this has to do with having multiple values of the same type in the `names_from` column.
# Reshape to one row per file_number with one column per reader.
# This is the call that returns list-columns when a (file_number, reader)
# pair occurs more than once in `df`.
df %>%
  tidyr::pivot_wider(
    id_cols = file_number,
    names_from = reader,
    values_from = event
  )
id aa ae
3098129 <S3: vctrs_list_of> <S3: vctrs_list_of>
3096451 <S3: vctrs_list_of> <S3: vctrs_list_of>
Along with the following warning:
Values in `event` are not uniquely identified; output will contain list-cols.
* Use `values_fn = list(event = list)` to suppress this warning.
* Use `values_fn = list(event = length)` to identify where the duplicates arise
* Use `values_fn = list(event = summary_fun)` to summarise duplicates
My question is: Why does pivot_wider output S3 vector lists?
EDIT -added better reproducible example. -redefined question.
I am able to fix the issue by using the `tidyr::unnest` function on the S3 vector objects.
# Pivot to wide form, then flatten the resulting list-columns back into
# ordinary columns.
# - ungroup() drops any grouping still attached to `df` before reshaping.
# - `cols` is passed explicitly: calling unnest() without it is deprecated
#   since tidyr 1.0.0 and emits a warning. `aa` and `bb` are the columns
#   created from the `reader` values in this example.
df %>%
  ungroup() %>%
  pivot_wider(names_from = reader, values_from = event) %>%
  tidyr::unnest(cols = c(aa, bb))
id aa bb
1 0 0
2 0 1
3 1 0
4 1 1
5 0 1
NOTE: all variables are factors now
In general, this can happen if the `names_from` column has duplicate rows and there is no sequence identifier to tell them apart:
library(tidyr)
library(dplyr)
# Pivot without any column that tells repeated rows apart: every value column
# comes back as a list-column.
df %>%
  pivot_wider(
    names_from = reader,
    values_from = event
  )
# A tibble: 124 x 8
# file_number aa ae db dl mk mm np
# <chr> <list<chr>> <list<chr>> <list<chr>> <list<chr>> <list<chr>> <list<chr>> <list<chr>>
# 1 3098129 [1] [0] [0] [0] [0] [0] [0]
# 2 3096451 [1] [0] [0] [0] [0] [0] [0]
# 3 3096774 [1] [0] [0] [0] [0] [0] [1]
# 4 3095276 [1] [0] [0] [0] [0] [0] [0]
# 5 3095464 [1] [0] [0] [0] [0] [0] [0]
# 6 3096846 [1] [0] [0] [0] [0] [0] [0]
# 7 3097132 [1] [0] [0] [0] [0] [0] [0]
# 8 3096355 [1] [0] [0] [0] [0] [0] [0]
# 9 3096951 [1] [0] [0] [1] [0] [0] [0]
#10 3096328 [1] [0] [0] [0] [0] [0] [0]
# … with 114 more rows
So, for those cases, we need to create the sequence by the grouping variable
# Number the rows within each reader so that every (reader, rn) pair
# identifies exactly one row; each output cell then holds a single value
# instead of a list.
df %>%
  group_by(reader) %>%
  mutate(rn = row_number()) %>% # recreated unique identifier column
  # The grouping was only needed for row_number(); drop it before reshaping
  # so the pivot does not operate on a grouped data frame.
  ungroup() %>%
  pivot_wider(names_from = reader, values_from = event)
# A tibble: 139 x 9
# file_number rn aa ae db dl mk mm np
# <chr> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 3098129 1 fail <NA> <NA> <NA> <NA> <NA> <NA>
# 2 3096451 2 fail <NA> <NA> <NA> <NA> <NA> <NA>
# 3 3096774 3 fail <NA> <NA> <NA> <NA> <NA> <NA>
# 4 3095276 4 fail <NA> <NA> <NA> <NA> <NA> <NA>
# 5 3095464 5 pass <NA> <NA> <NA> <NA> <NA> <NA>
# 6 3096846 6 fail <NA> <NA> <NA> <NA> <NA> <NA>
# 7 3097132 7 fail <NA> <NA> <NA> <NA> <NA> <NA>
# 8 3096355 8 pass <NA> <NA> <NA> <NA> <NA> <NA>
# 9 3096951 9 fail <NA> <NA> <NA> <NA> <NA> <NA>
#10 3096328 10 fail <NA> <NA> <NA> <NA> <NA> <NA>
# … with 129 more rows
The columns are all factors because, in the `data.frame` call, if we don't specify `stringsAsFactors = FALSE`, it defaults to `TRUE` (this was the default before R 4.0.0):
# Inspect the structure of `df` to see each column's type.
# The commented lines below are the output shown for the small id/reader/event
# example: all three columns are factors.
str(df)
#'data.frame': 10 obs. of 3 variables:
# $ id : Factor w/ 5 levels "1","2","3","4",..: 1 2 3 4 5 1 2 3 4 5
# $ reader: Factor w/ 2 levels "aa","bb": 1 1 1 1 1 2 2 2 2 2
# $ event : Factor w/ 2 levels "0","1": 2 2 1 1 1 2 1 2 1 2
Instead, specify `stringsAsFactors = FALSE` and the columns will be character:
# Build a small 10-row example with character (not factor) columns:
#   id     - "1".."5", repeated once per reader
#   reader - "aa" for the first five rows, "bb" for the last five
#   event  - random "0"/"1" draws (Bernoulli, p = 0.5); call set.seed() first
#            if the example needs to be reproducible
df <- data.frame(
  id = as.character(rep(1:5, 2)),
  reader = rep(c("aa", "bb"), each = 5),
  event = as.character(rbinom(10, size = 1, prob = 0.5)),
  stringsAsFactors = FALSE
)
TL;DR
If you end up with values that can't compose a vector, you will get a list instead.
This will happen, for instance, if `pivot_wider` finds and combines multiple values into a list because it couldn't uniquely identify a record, or because the values are not all of the same basic type, or because a value is not a basic type or can't properly compose a vector, such as `NULL`.
More details:
In your example, you have a duplicate record:
# Keep only the rows that exactly repeat an earlier row (all columns equal).
df %>%
  filter(duplicated(.))
# # A tibble: 1 x 3
# file_number reader event
# <chr> <chr> <chr>
# 1 3098477 mk fail
Because there are multiple `event` values for the same `file_number` + `reader`, `pivot_wider` doesn't know what to do with them other than combining them in a list, and the `event` column is now turned into a list of lists with these values combined, as warned with "Values in `event` are not uniquely identified; output will contain list-cols.":
# Pivot as-is, then look at the `mk` cell for the duplicated file number to
# see the values that were combined into a list.
df %>%
  pivot_wider(
    names_from = reader,
    values_from = event
  ) %>%
  filter(file_number == "3098477") %>%
  select(mk) %>%
  glimpse()
# Warning: Values are not uniquely identified; output will contain list-cols.
# * Use `values_fn = list` to suppress this warning.
# * Use `values_fn = length` to identify where the duplicates arise
# * Use `values_fn = {summary_fun}` to summarise duplicates
# Rows: 1
# Columns: 1
# $ mk <list> <"fail", "fail">
If this was a mistake, or if you don't really care about duplicate records, you can:
# Drop exact duplicate rows first, so each (file_number, reader) cell holds a
# single value after pivoting.
df %>%
  unique() %>%
  pivot_wider(
    names_from = reader,
    values_from = event
  )
# # A tibble: 124 x 8
# file_number aa ae db dl mk mm np
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 3098129 fail NA NA NA NA NA NA
# 2 3096451 fail NA NA NA NA NA NA
# 3 3096774 fail NA NA NA NA NA fail
# 4 3095276 fail NA NA NA NA NA NA
# 5 3095464 pass NA NA NA NA NA NA
# 6 3096846 fail NA NA NA NA NA NA
# 7 3097132 fail NA NA NA NA NA NA
# 8 3096355 pass NA NA NA NA NA NA
# 9 3096951 fail NA NA pass NA NA NA
# 10 3096328 fail NA NA NA NA NA NA
# # … with 114 more rows
Alternatively, if you do expect duplicates or even multiple distinct values for the same `file_number` + `reader`, you can teach `pivot_wider` how to combine these values with a function:
# Collapse repeated values for the same file_number + reader into one
# comma-separated string, then show the row that had the duplicate.
df %>%
  pivot_wider(
    id_cols = file_number,
    names_from = reader,
    values_from = event,
    values_fn = function(values) paste(values, collapse = ", ")
  ) %>%
  filter(file_number == "3098477")
# # A tibble: 1 x 8
# file_number aa ae db dl mk mm np
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 3098477 NA NA NA NA fail, fail NA NA
Finally, if you want to keep multiple rows with an entry for each value per `file_number` + `reader`, adding another column with an artificial unique identifier will do the trick.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.