简体   繁体   中英

R pivot_wider with multiple headers

I have a dataset that looks like this:

ROI Group ID Detections_per_area
Brainstem A JK002 0.0327285333827735
Cerebellum A JK002 0.149208050073911
Brainstem A JK002 0.0336959892253705
Cerebellum A JK002 0

The table above is truncated; the full dataset, as dput() output, is here:

structure(list(ROI = c("Brainstem", "Cerebellum", "Brainstem", 
"Cerebellum", "Brainstem", "Cerebellum", "Brainstem", "Cerebellum", 
"Brainstem", "Brainstem", "Brainstem", "Cerebellum", "Brainstem", 
"Brainstem", "Cerebellum", "Brainstem", "Brainstem", "Brainstem", 
"Cerebellum", "Cerebellum", "Brainstem", "Brainstem", "Brainstem", 
"Cerebellum", "Brainstem", "Brainstem", "Cerebellum", "Cerebellum", 
"Brainstem", "Brainstem", "Cerebellum", "Brainstem", "Brainstem", 
"Brainstem", "Cerebellum", "Cerebellum", "Brainstem", "Brainstem", 
"Brainstem", "Brainstem", "Cerebellum", "Brainstem", "Cerebellum"
), Group = c("A", "A", "A", "A", "A", "A", 
"A", "A", "A", "A", "A", "A", "A", "A", 
"A", "A", "A", "B", "B", "B", "B", "B", "B", 
"B", "B", "B", "B", "B", "B", "B", "B", "B", 
"B", "C", "C", "C", "C", "C", "C", "C", "C", 
"C", "C"), ID = c("JK002", "JK002", "JK002", "JK002", 
"JK003", "JK003", "JK003", "JK003", "JK003", "JK004", "JK004", 
"JK004", "JK004", "JK005", "JK005", "JK005", "JK005", "JK006", 
"JK006", "JK006", "JK006", "JK006", "JK007", "JK007", "JK007", 
"JK008", "JK008", "JK008", "JK008", "JK008", "JK011", "JK011", 
"JK011", "JK011", "JK009", "JK009", "JK009", "JK009", "JK009", 
"JK010", "JK010", "JK010", "JK010"), Detections_per_area = c(0.0327285333827735, 
0.149208050073911, 0.0336959892253705, 0, 0.615965559353422, 
0.117064703241855, 0.171329148144879, 0, 0.131086542762028, 0.143700717389906, 
0.0590510074394172, 0, 0.0610284572370045, 0.358989653774582, 
0.113758644699529, 0.289455536567144, 0, 0.0355596373594045, 
0, 0.0661718135522336, 0.243313220125351, 0.373564943266101, 
0.591264576854053, 0.322774099148268, 0.725656324981598, 0.46343627586687, 
0, 0.227796467592069, 0.0876146804249713, 0.297011825126973, 
0.163792893059129, 0.336883295806429, 1.08791432091601, 0.370788052321182, 
1.49652281121774, 1.18495456669418, 3.36132331547411, 0.951628515574163, 
1.63575637052095, 0.408866067869112, 0, 1.47956381894319, 0.368780820643061
)), row.names = c(NA, -43L), class = "data.frame")

I'm trying to convert it to a GraphPad Prism-friendly format using R. This is the intended output, with the values from the Detections_per_area column filled in where the x's are. Annoyingly, Prism requires all groups to be the same size, regardless of the number of values, hence the Nulls below.

A A A A A B B B B B C C C C C
JK002 JK002 JK003 JK004 JK005 JK001 JK009 JK010 Null Null JK006 JK007 JK008 JK011 Null
Brainstem x x x x x x x x Null Null x x x x Null
Cerebellum x x x x x x x Null Null x x x x Null

Without the ID column, I have been able to generate the intended output using this code:

library(tidyr)
library(dplyr)

# Pad every ROI-by-Group combination to the size of the largest one, then
# pivot so that each (Group, position) pair becomes its own column.
# NOTE: this version is for `data_source` without the ID column.

# Number of rows in the largest ROI-by-Group combination. Taking the maximum
# directly avoids sorting the whole count table just to read its first row.
group_sizes <- data_source %>% count(ROI, Group)
largest <- max(group_sizes$n)

# Row positions that every combination must have. Named `rn_values` rather
# than `range` so that base::range() is not masked.
rn_values <- seq_len(largest)

data_wide <- data_source %>%
  arrange(Group) %>%
  group_by(ROI, Group) %>%
  mutate(rn = row_number()) %>%
  complete(rn = rn_values) %>% # Absent positions are added, filled with NA
  ungroup() %>% # Do not carry the grouping into the reshaped result
  pivot_wider(
    names_from = c(Group, rn),
    values_from = Detections_per_area
  )

# Keep only the group part of each header ("A_1" -> "A"), then blank out the
# ROI header.
names(data_wide) <- gsub(
  pattern = "_.*", replacement = "", x = names(data_wide)
)
names(data_wide) <- gsub(
  pattern = "ROI", replacement = "", x = names(data_wide)
)

This finds the size of the largest ROI-by-Group combination, numbers the rows within each combination, pads every combination up to that size, and then pivots.

Any advice on the best way to do this would be great. I thought about adding the ID to the column name and then removing it after the pivot, but haven't had any luck. This previously asked question did the opposite.

As I said in the comments, I am not comfortable with how those IDs are reused. From what I can tell, there's no good way to determine whether "0.0327285333827735" (in the 1st row) should be associated with "0.149208050073911" (in the 2nd row) or "0" (in the 4th row). The snippet below handles that correctly, assuming that the order is meaningful. But I am really uncomfortable that "Brainstem+A+JK003" has three rows, while "Cerebellum+A+JK003" has two. I strongly suggest that these IDs aren't reused.

I'm assuming that can be fixed, and the block below simulates the fix: it keeps only the first row of each ROI-by-Group-by-ID combination, so that an ID is never reused within a ROI-by-Group combination.

# Simulate non-reused IDs: keep just the first row of every
# ROI-by-Group-by-ID combination, then drop the grouping again.
data_source <-
  data_source |>
  dplyr::group_by(ROI, Group, ID) |>
  dplyr::slice(1L) |>
  dplyr::ungroup()

First, a balanced skeleton is built so that each ROI-by-Group combination has the same number of rows. The empty slots are filled later with the "Null" placeholder, as you specified above.

# Both objects below number the observations within each ROI-by-Group
# combination in the same way, so they can be joined on (ROI, Group, rn).
# `row_number()` is called without an argument on purpose: ranking by `Group`
# (which is constant inside each group) would only produce 1..n through
# tie-breaking, and would give NA wherever `Group` is missing.

# Determine structure: every ROI x Group x rn combination, including the
# slots that have no observation (those are labelled "Null" after the join).
ds_skeleton <-
  data_source |>
  dplyr::select(ROI, Group) |>
  dplyr::group_by(ROI, Group) |>
  dplyr::mutate(
    rn = dplyr::row_number(),
  ) |>
  dplyr::ungroup() |>
  tidyr::complete(ROI, Group, rn)

# Order the observed data (with holes)
d2 <-
  data_source |>
  dplyr::select(
    ROI,
    Group,
    ID,
    y = Detections_per_area,    # Shorten to help with SO's narrow window.
  ) |>
  dplyr::group_by(ROI, Group) |>
  dplyr::mutate(
    rn    = dplyr::row_number(),
  ) |>
  dplyr::ungroup()

Then join the structure with the values.

# Attach the observed values to the skeleton. Slots without an observation
# get "Null" both as their ID and as their value.
ds_long <-
  ds_skeleton |>
  dplyr::left_join(d2, by = c("ROI", "Group", "rn")) |>
  dplyr::mutate(ID = dplyr::coalesce(ID, "Null")) |>
  dplyr::mutate(
    # Column label: group, zero-padded position ("001" to "999"), and ID
    tag = sprintf("%s_%03i_%s", Group, rn, ID),
    # Values become text so that they can share a column with "Null"
    y = dplyr::if_else(ID == "Null", "Null", as.character(y))
  ) |>
  dplyr::select(ROI, tag, y)

# Reshape to wide: one row per ROI, one column per tag, cells taken from y
ds_wide <- tidyr::pivot_wider(
  ds_long,
  id_cols = ROI,
  names_from = tag,
  values_from = y
)

Finally, the headers are constructed and stacked on top of the real data. (There might be a more concise way to do this with dplyr::across().)

# Construct the two header rows
# The pattern splits a tag such as "A_001_JK002" into its group letter
# (capture 1) and its ID (capture 2); the group must be one capital letter.
pattern <- "^([A-Z])_\\d{3}_(.+)$" # Extracting info from tag
ds_header <-
  # One row per tag, i.e. per column of ds_wide other than ROI
  tibble::tibble(tag = setdiff(names(ds_wide), "ROI")) |>
  dplyr::mutate(
    header_1 = sub(pattern, "\\1", tag),
    header_2 = sub(pattern, "\\2", tag)
  ) |>
  # Turn the two header columns into two rows, spread across the tag columns
  tidyr::pivot_longer(
    cols = c("header_1", "header_2"),
    names_to = "ROI"
  ) |>
  tidyr::pivot_wider(names_from = "tag") |>
  # The header rows carry no label in the ROI column
  dplyr::mutate(ROI = "")

# Stack the two header rows on top of the data rows
dplyr::union_all(ds_header, ds_wide)

To create a CSV, pass col_names = FALSE to readr::write_csv(), so that the tag column names are not written above the two header rows.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM