R pivot_wider with multiple headers

Question

I have a dataset that looks like this:

ROI	Group	ID	Detections_per_Area
Brainstem	A	JK002	0.0327285333827735
Cerebellum	A	JK002	0.149208050073911
Brainstem	A	JK002	0.0336959892253705
Cerebellum	A	JK002	0

(Truncated; the full dataset is given below as dput() output.)

structure(list(ROI = c("Brainstem", "Cerebellum", "Brainstem", 
"Cerebellum", "Brainstem", "Cerebellum", "Brainstem", "Cerebellum", 
"Brainstem", "Brainstem", "Brainstem", "Cerebellum", "Brainstem", 
"Brainstem", "Cerebellum", "Brainstem", "Brainstem", "Brainstem", 
"Cerebellum", "Cerebellum", "Brainstem", "Brainstem", "Brainstem", 
"Cerebellum", "Brainstem", "Brainstem", "Cerebellum", "Cerebellum", 
"Brainstem", "Brainstem", "Cerebellum", "Brainstem", "Brainstem", 
"Brainstem", "Cerebellum", "Cerebellum", "Brainstem", "Brainstem", 
"Brainstem", "Brainstem", "Cerebellum", "Brainstem", "Cerebellum"
), Group = c("A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "B", "B", "B", "B", "B", "B", 
"B", "B", "B", "B", "B", "B", "B", "B", "B", 
"B", "C", "C", "C", "C", "C", "C", "C", "C", 
"C", "C"), ID = c("JK002", "JK002", "JK002", "JK002", 
"JK003", "JK003", "JK003", "JK003", "JK003", "JK004", "JK004", 
"JK004", "JK004", "JK005", "JK005", "JK005", "JK005", "JK006", 
"JK006", "JK006", "JK006", "JK006", "JK007", "JK007", "JK007", 
"JK008", "JK008", "JK008", "JK008", "JK008", "JK011", "JK011", 
"JK011", "JK011", "JK009", "JK009", "JK009", "JK009", "JK009", 
"JK010", "JK010", "JK010", "JK010"), Detections_per_area = c(0.0327285333827735, 
0.149208050073911, 0.0336959892253705, 0, 0.615965559353422, 
0.117064703241855, 0.171329148144879, 0, 0.131086542762028, 0.143700717389906, 
0.0590510074394172, 0, 0.0610284572370045, 0.358989653774582, 
0.113758644699529, 0.289455536567144, 0, 0.0355596373594045, 
0, 0.0661718135522336, 0.243313220125351, 0.373564943266101, 
0.591264576854053, 0.322774099148268, 0.725656324981598, 0.46343627586687, 
0, 0.227796467592069, 0.0876146804249713, 0.297011825126973, 
0.163792893059129, 0.336883295806429, 1.08791432091601, 0.370788052321182, 
1.49652281121774, 1.18495456669418, 3.36132331547411, 0.951628515574163, 
1.63575637052095, 0.408866067869112, 0, 1.47956381894319, 0.368780820643061
)), row.names = c(NA, -43L), class = "data.frame")

I'm trying to convert it to a GraphPad Prism-friendly format using R. This is the intended output, with the values of Detections_per_area filled in (shown as x). Annoyingly, Prism requires all groups to be the same size, regardless of the number of values, hence the Null placeholders below.

	A	A	A	A	A	B	B	B	B	B	C	C	C	C	C
	JK002	JK002	JK003	JK004	JK005	JK001	JK009	JK010	Null	Null	JK006	JK007	JK008	JK011	Null
Brainstem	x	x	x	x	x	x	x	x	Null	Null	x	x	x	x	Null
Cerebellum	x		x	x	x	x	x	x	Null	Null	x	x	x	x	Null

Without the ID column, I have been able to generate the intended output using this code:

library(tidyr)
library(dplyr)

# Size of every ROI x Group combination; the largest one determines how many
# columns each Group needs in the wide table.
group_sizes <- data_source %>% count(ROI, Group)
largest_group <- max(group_sizes$n)
# Named `row_ids` rather than `range` so that base::range() is not masked.
row_ids <- seq_len(largest_group)

data_wide <- data_source %>%
  arrange(Group) %>%
  group_by(ROI, Group) %>%
  mutate(rn = row_number()) %>%
  # Pad every ROI x Group combination with NA rows up to the largest size.
  complete(rn = row_ids) %>%
  pivot_wider(
    names_from = c(Group, rn),
    values_from = Detections_per_area
  )

# Strip the "_<rn>" suffix so that each column is labelled with its Group only,
# then blank out the header of the ROI column.
names(data_wide) <- gsub(pattern = "_.*", replacement = "", x = names(data_wide))
names(data_wide) <- gsub(pattern = "ROI", replacement = "", x = names(data_wide))

This finds the size of the largest ROI-by-Group combination, numbers the rows within each combination, pads every combination out to that size, and then pivots.

Any advice on the best way to do this would be great. I thought about adding the ID to the column name and then removing it post-pivot, but haven't had any luck. This previously asked question did the opposite.

Answer 1

As I said in the comments, I am not comfortable with how those IDs are reused. From what I can tell, there's no good way to determine whether "0.0327285333827735" (in the 1st row) should be associated with "0.149208050073911" (in the 2nd row) or "0" (in the 4th row). The snippet below handles that correctly, assuming that the order is meaningful. But I am really uncomfortable that "Brainstem+A+JK003" has three rows, while "Cerebellum+A+JK003" has two. I strongly suggest that these IDs aren't reused.

I'm assuming that can be fixed, and the block below simulates that fix: it keeps only the first row of each ROI-by-Group-by-ID combination, so that an ID is never reused within an ROI-by-Group combination.

# Keep only the first row of every ROI x Group x ID combination, so that an
# ID occurs at most once per ROI x Group.
data_source <-
  dplyr::group_by(data_source, ROI, Group, ID) |>
  dplyr::slice(1L) |>
  dplyr::ungroup()

First the dataset is balanced so each ROIxGroup has the same number of rows. The "Null" placeholder is used, as you specified above.

# Skeleton of the balanced table: one row for every ROI x Group x rn
# combination, where rn numbers the rows inside each ROI x Group cell.
# Combinations that were not observed are added by complete(); those are the
# slots that later receive the "Null" placeholder.
ds_skeleton <-
  dplyr::group_by(data_source, ROI, Group) |>
  dplyr::mutate(rn = dplyr::row_number(Group)) |>
  dplyr::ungroup() |>
  dplyr::select(ROI, Group, rn) |>
  tidyr::complete(ROI, Group, rn)

# Observed values only (no placeholders yet), numbered within each
# ROI x Group cell so they can be matched to the skeleton by rn.
# Detections_per_area is renamed to y purely to keep the printed output narrow.
d2 <-
  data_source |>
  dplyr::group_by(ROI, Group) |>
  dplyr::mutate(rn = dplyr::row_number(Group)) |>
  dplyr::ungroup() |>
  dplyr::select(ROI, Group, ID, y = Detections_per_area, rn)

Then join the structure with the values.

# Attach the observed values to the skeleton. Skeleton rows without a match
# get the ID "Null" and the value "Null"; y becomes character so that it can
# hold the placeholder next to the numbers.
ds_long <-
  dplyr::left_join(ds_skeleton, d2, by = c("ROI", "Group", "rn")) |>
  dplyr::mutate(ID = dplyr::coalesce(ID, "Null")) |>
  dplyr::mutate(
    # rn is zero-padded to three digits ("001" to "999") inside the tag.
    tag = sprintf("%s_%03i_%s", Group, rn, ID),
    y = dplyr::if_else(ID == "Null", "Null", as.character(y))
  ) |>
  dplyr::select(ROI, tag, y)

# Pivot the values to the wide format: one row per ROI, one column per tag.
# names_sort = TRUE orders the new columns by tag (Group, then the zero-padded
# rn, then ID) instead of by first appearance in ds_long, so the columns of one
# Group stay adjacent even when a tag occurs only in a later ROI.
ds_wide <-
  ds_long |>
  tidyr::pivot_wider(
    id_cols     = "ROI",
    names_from  = "tag",
    values_from = "y",
    names_sort  = TRUE
  )

Finally, the headers are constructed and stacked on top of the real data. (There might be a more concise way to do this with dplyr::across().)

# Build the two header rows (Group on the first, ID on the second) from the
# tag column names of ds_wide.
pattern <- "^([A-Z])_\\d{3}_(.+)$" # Capture 1 = Group, capture 2 = ID
tags <- setdiff(names(ds_wide), "ROI")
ds_header <-
  tibble::tibble(
    tag      = tags,
    header_1 = sub(pattern, "\\1", tags),
    header_2 = sub(pattern, "\\2", tags)
  ) |>
  # One row per tag x header, then spread the tags back out into columns so
  # the result has the same columns as ds_wide.
  tidyr::pivot_longer(
    cols     = c("header_1", "header_2"),
    names_to = "ROI"
  ) |>
  tidyr::pivot_wider(names_from = "tag") |>
  # The ROI cell of both header rows is left empty.
  dplyr::mutate(ROI = "")

# Stack the two header rows on top of the data rows
dplyr::union_all(ds_header, ds_wide)

To create a CSV, pass col_names = FALSE to readr::write_csv() so that the tag column names are not written above the two header rows.

R pivot_wider with multiple headers

Question

1 answers

solution1
0 2023-01-15 20:24:27

R pivot_wider with multiple headers

Question

1 answers

solution1 0 2023-01-15 20:24:27

solution1
0 2023-01-15 20:24:27