R: How to group data and assign factor levels within different groups in a dataframe?

Question

structure(list(drug = c("Chlorambucil", "Fludarabine", "FludarabineMafosfamide", 
"NDI031301", "CMPB", "Tofacitinib", "Peficitinib", "FludarabineMafosfamide", 
"PDB", "Filgotinib", "Dexamethasone", "CMPA", "Lenalidomide", 
"Dexamethasone", "Gandotinib", "NDI031301", "Filgotinib", "PDB", 
"CMPB", "Ruxolitinib", "CC122", "Atovaquone", "CC122", "SAR20347", 
"Momelotinib", "Momelotinib", "Tofacitinib", "Fludarabine", "Fludarabine", 
"Cerdulatinib", "Lenalidomide", "Atovaquone", "Chlorambucil", 
"CMPA", "FludarabineMafosfamide", "FludarabineMafosfamide", "Fludarabine", 
"Atovaquone", "Momelotinib", "PDB", "Filgotinib", "Chlorambucil", 
"Dexamethasone", "Tofacitinib", "SAR20347", "CMPB", "Momelotinib", 
"Fludarabine", "Cerdulatinib", "Peficitinib", "Atovaquone", "CC122", 
"CMPA", "NDI031301", "PDB", "CMPA", "Lenalidomide", "SAR20347", 
"Tofacitinib", "Gandotinib", "Lenalidomide", "Peficitinib", "CMPB", 
"CC122", "Dexamethasone", "FludarabineMafosfamide", "Ruxolitinib", 
"CMPB", "Peficitinib", "Tofacitinib", "FludarabineMafosfamide", 
"Filgotinib", "Dexamethasone", "CMPA", "Dexamethasone", "Gandotinib", 
"NDI031301", "Filgotinib", "SAR20347", "CMPB", "Ruxolitinib", 
"Peficitinib", "Atovaquone", "CC122", "SAR20347", "Momelotinib", 
"Momelotinib", "Tofacitinib", "Fludarabine", "Fludarabine", "Cerdulatinib", 
"Atovaquone", "Chlorambucil", "CMPA", "NDI031301"), dose = c(1, 
1, 10, 1, 0.1, 1, 1, 1, 100, 1, 10, 1, 10, 100, 1, 10, 10, 10, 
1, 1, 0.1, 3, 1, 1, 1, 0.1, 10, 1, 10, 1, 1, 30, 30, 0.1, 0.01, 
0.1, 0.01, 0.3, 0.001, 1, 0.01, 0.3, 0.1, 0.01, 0.1, 0.001, 0.01, 
0.1, 0.01, 0.1, 0.03, 0.01, 0.01, 0.01, 0.1, 0.001, 0.01, 0.01, 
0.1, 0.01, 0.1, 0.01, 0.01, 0.001, 1, 10, 10, 0.1, 1, 1, 1, 1, 
10, 1, 100, 1, 10, 10, 10, 1, 1, 10, 3, 1, 1, 1, 0.1, 10, 10, 
1, 1, 30, 30, 0.1, 1), drug.dose = c("Chlorambucil_1uM", "Fludarabine_1uM", 
"FludarabineMafosfamide_10ug/mlplus1ug/ml", "NDI031301_1uM", 
"CMPB_0.1uM", "Tofacitinib_1uM", "Peficitinib_1uM", "FludarabineMafosfamide_1ug/mlplus1ug/ml", 
"PDB_100ng/ml", "Filgotinib_1uM", "Dexamethasone_10uM", "CMPA_1uM", 
"Lenalidomide_10uM", "Dexamethasone_100uM", "Gandotinib_1uM", 
"NDI031301_10uM", "Filgotinib_10uM", "PDB_10ng/ml", "CMPB_1uM", 
"Ruxolitinib_1uM", "CC122_0.1uM", "Atovaquone_3uM", "CC122_1uM", 
"SAR20347_1uM", "Momelotinib_1uM", "Momelotinib_0.1uM", "Tofacitinib_10uM", 
"Fludarabine_1ug/ml", "Fludarabine_10ug/ml", "Cerdulatinib_1uM", 
"Lenalidomide_1uM", "Atovaquone_30uM", "Chlorambucil_30uM", "CMPA_0.1uM", 
"FludarabineMafosfamide_0.01ug/mlplus1ug/ml", "FludarabineMafosfamide_0.1ug/mlplus1ug/ml", 
"Fludarabine_0.01ug/ml", "Atovaquone_0.3uM", "Momelotinib_0.001uM", 
"PDB_1ng/ml", "Filgotinib_0.01uM", "Chlorambucil_0.3uM", "Dexamethasone_0.1uM", 
"Tofacitinib_0.01uM", "SAR20347_0.1uM", "CMPB_0.001uM", "Momelotinib_0.01uM", 
"Fludarabine_0.1ug/ml", "Cerdulatinib_0.01uM", "Peficitinib_0.1uM", 
"Atovaquone_0.03uM", "CC122_0.01uM", "CMPA_0.01uM", "NDI031301_0.01uM", 
"PDB_0.1ng/ml", "CMPA_0.001uM", "Lenalidomide_0.01uM", "SAR20347_0.01uM", 
"Tofacitinib_0.1uM", "Gandotinib_0.01uM", "Lenalidomide_0.1uM", 
"Peficitinib_0.01uM", "CMPB_0.01uM", "CC122_0.001uM", "Dexamethasone_1uM", 
"FludarabineMafosfamide_10ug/mlplus1ug/ml", "Ruxolitinib_10uM", 
"CMPB_0.1uM", "Peficitinib_1uM", "Tofacitinib_1uM", "FludarabineMafosfamide_1ug/mlplus1ug/ml", 
"Filgotinib_1uM", "Dexamethasone_10uM", "CMPA_1uM", "Dexamethasone_100uM", 
"Gandotinib_1uM", "NDI031301_10uM", "Filgotinib_10uM", "SAR20347_10uM", 
"CMPB_1uM", "Ruxolitinib_1uM", "Peficitinib_10uM", "Atovaquone_3uM", 
"CC122_1uM", "SAR20347_1uM", "Momelotinib_1uM", "Momelotinib_0.1uM", 
"Tofacitinib_10uM", "Fludarabine_10ug/ml", "Fludarabine_1ug/ml", 
"Cerdulatinib_1uM", "Atovaquone_30uM", "Chlorambucil_30uM", "CMPA_0.1uM", 
"NDI031301_1uM"), combo = c("none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none", "none", "none", "none", "none", "none", "none", 
"none", "none"), cluster = c(3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 6L, 6L, 6L), dosage = c("1uM", "1uM", "10ug/mlplus1ug/ml", 
"1uM", "0.1uM", "1uM", "1uM", "1ug/mlplus1ug/ml", "100ng/ml", 
"1uM", "10uM", "1uM", "10uM", "100uM", "1uM", "10uM", "10uM", 
"10ng/ml", "1uM", "1uM", "0.1uM", "3uM", "1uM", "1uM", "1uM", 
"0.1uM", "10uM", "1ug/ml", "10ug/ml", "1uM", "1uM", "30uM", "30uM", 
"0.1uM", "0.01ug/mlplus1ug/ml", "0.1ug/mlplus1ug/ml", "0.01ug/ml", 
"0.3uM", "0.001uM", "1ng/ml", "0.01uM", "0.3uM", "0.1uM", "0.01uM", 
"0.1uM", "0.001uM", "0.01uM", "0.1ug/ml", "0.01uM", "0.1uM", 
"0.03uM", "0.01uM", "0.01uM", "0.01uM", "0.1ng/ml", "0.001uM", 
"0.01uM", "0.01uM", "0.1uM", "0.01uM", "0.1uM", "0.01uM", "0.01uM", 
"0.001uM", "1uM", "10ug/mlplus1ug/ml", "10uM", "0.1uM", "1uM", 
"1uM", "1ug/mlplus1ug/ml", "1uM", "10uM", "1uM", "100uM", "1uM", 
"10uM", "10uM", "10uM", "1uM", "1uM", "10uM", "3uM", "1uM", "1uM", 
"1uM", "0.1uM", "10uM", "10ug/ml", "1ug/ml", "1uM", "30uM", "30uM", 
"0.1uM", "1uM")), row.names = c(NA, -95L), class = "data.frame")

Sorry, rookie question here: I have this complicated drug cluster data, as shown in the screenshot.

I want to display them in a stacked geom_col-style plot, with "drug" on the x axis, the count of occurrences on the y axis, and facets by cluster.

So far it is pretty easy. But I also want to see the distribution of these drugs and dosages in each cluster by using color fill to match their dose. The real dosage has different units etc.

I extracted the numeric dose into its own column. I want to assign a factor ("min", "low", "high", "max") to reflect the dose levels, since I know each drug has 4 different dosages.

The problem is that the numeric dose range differs between drugs, so I can't simply rank the whole dose column.

For example, some drug doses range from 0.03 to 30, some range from 0.3 to 300, and some range from 0.01 to 10.

So how could I assign the dose level within each drug using that numeric dose column?

Answer 1

Here's an approach with rank() and a join. We can take advantage of the fact that each drug uses the same units for all of its doses, so the numeric doses are comparable within a drug.

library(dplyr)
# Build a per-drug lookup table that labels each distinct dose by its rank
# within that drug (lowest dose = "min", fourth-lowest = "max").
#
# Notes:
# - distinct(drug, dose) replaces group_by() + select(dose) +
#   filter(!duplicated(dose)). It keeps the same drug/dose rows, but without
#   the "Adding missing grouping variables" message that select() emits when
#   the grouping column is left out on grouped data.
# - After de-duplication there are no ties within a drug, so rank() returns
#   whole numbers that can safely index the label vector.
# - Ranking only sees the doses present in the data: a drug with fewer than
#   four distinct doses never reaches "max" (its largest dose gets a lower
#   label).
# - Only the numeric dose is ranked; the units stored in `dosage` are ignored.
# - category is a character column; wrap it in
#   factor(..., levels = c("min", "low", "high", "max")) if the levels must
#   keep this order (e.g. in a plot legend).
df %>%
  arrange(drug) %>% # sort by drug so each drug's doses print together
  distinct(drug, dose) %>% # one row per drug/dose combination
  group_by(drug) %>% # rank within each drug, not across the whole column
  mutate(
    rank = rank(dose), # 1 = lowest dose of this drug
    category = c("min", "low", "high", "max")[rank] # label by within-drug rank
  )
# A tibble: 67 x 4
# Groups:   drug [19]
   drug           dose  rank category
   <chr>         <dbl> <dbl> <chr>   
 1 Atovaquone    3         3 high    
 2 Atovaquone   30         4 max     
 3 Atovaquone    0.3       2 low     
 4 Atovaquone    0.03      1 min     
 5 CC122         0.1       3 high    
 6 CC122         1         4 max     
 7 CC122         0.01      2 low     
 8 CC122         0.001     1 min     
 9 Cerdulatinib  1         2 low     
10 Cerdulatinib  0.01      1 min     
# … with 57 more rows

Now we can join back to the original data.frame:

# Compute the per-drug dose labels (rank 1 = lowest dose of that drug) and
# attach them to every row of the original data.
#
# Notes:
# - distinct(drug, dose) yields one row per drug/dose combination, so each
#   row of df matches exactly one lookup row in the join below.
# - The join keys are spelled out with `by`: this avoids the
#   "Joining, by = ..." message and keeps the keys fixed even if df later
#   gains its own `rank` or `category` column.
# - `dose` is a double, but both sides of the join come from the same df, so
#   the values compare exactly equal.
# - The result is still grouped by drug; call ungroup() before any further
#   work that should not be per-drug.
df %>%
  arrange(drug) %>%
  distinct(drug, dose) %>% # lookup rows: one per drug/dose
  group_by(drug) %>%
  mutate(
    rank = rank(dose), # rank doses within each drug
    category = c("min", "low", "high", "max")[rank]
  ) %>%
  right_join(df, by = c("drug", "dose")) # keep every row of the original df
# A tibble: 95 x 8
# Groups:   drug [19]
   drug        dose dosage  rank category drug.dose         combo cluster
   <chr>      <dbl> <chr>  <dbl> <chr>    <chr>             <chr>   <int>
 1 Atovaquone  3    3uM        3 high     Atovaquone_3uM    none        4
 2 Atovaquone  3    3uM        3 high     Atovaquone_3uM    none        6
 3 Atovaquone 30    30uM       4 max      Atovaquone_30uM   none        4
 4 Atovaquone 30    30uM       4 max      Atovaquone_30uM   none        6
 5 Atovaquone  0.3  0.3uM      2 low      Atovaquone_0.3uM  none        5
 6 Atovaquone  0.03 0.03uM     1 min      Atovaquone_0.03uM none        5
 7 CC122       0.1  0.1uM      3 high     CC122_0.1uM       none        4
 8 CC122       1    1uM        4 max      CC122_1uM         none        4
 9 CC122       1    1uM        4 max      CC122_1uM         none        6
10 CC122       0.01 0.01uM     2 low      CC122_0.01uM      none        5
# … with 85 more rows

R: How to group data and assign factor levels within different groups in a dataframe?

Question

1 answer

solution1
2 ACCEPTED 2021-03-30 17:09:31

R: How to group data and assign factor levels within different groups in a dataframe?

Question

1 answer

solution1 2 ACCEPTED 2021-03-30 17:09:31

solution1
2 ACCEPTED 2021-03-30 17:09:31