简体   繁体   中英

return the name of the column with highest and 2nd highest and 3rd highest value in R

Struggling to get this right and hoping someone could help...

I have a dataset in which I am trying to get the row_max to be the highest value between columns A:N, and row_min should be the 2nd largest (not zeros).

I tried the following to get row_max:

tmp$row_max = colnames(tmp[,5:9])[apply(tmp[,5:9],1,which.max)]

which.min returns a column that contains a 0

           contig  pos ref cov  A  T  C  G N row_max 
1: NW_017095466.1  130   N  41 39  2  0  0 0       A 
2: NW_017095466.1  166   N  48  0 46  2  0 0       T 
3: NW_017095466.1  427   N  52 50  0  0  2 0       A 
4: NW_017095466.1 1736   N  54 44  0 10  0 0       A 
5: NW_017095466.1 1918   N  46  0  0  3 43 0       G 
6: NW_017095466.1 2688   N  52  5  0 47  0 0       C 

I can sort the columns and select the 2nd highest, but this gives me the value, struggling to get the column_name to be returned:

apply(tmp[,5:9], 1, FUN = function(x) sort(x)[4])

Is there a neat tidyverse solution to this?

dput(tmp)
structure(list(contig = c("NW_017095466.1", "NW_017095466.1", 
"NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1"
), pos = c(130L, 166L, 427L, 1736L, 1918L, 2688L), ref = c("N", 
"N", "N", "N", "N", "N"), cov = c(41L, 48L, 52L, 54L, 46L, 52L
), A = c(39L, 0L, 50L, 44L, 0L, 5L), T = c(2L, 46L, 0L, 0L, 0L, 
0L), C = c(0L, 2L, 0L, 10L, 3L, 47L), G = c(0L, 0L, 2L, 0L, 43L, 
0L), N = c(0L, 0L, 0L, 0L, 0L, 0L), row_max = c("A", "T", "A", 
"A", "G", "C"), row_min = c("C", "A", "T", "T", "A", "T")), row.names = c(NA, 
-6L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x7f94c30168e0>)

You can easily do this with order() , which will give you the the position of the elements if they were sorted. Use this to select the names in the right order and you will get a matrix of names sorted by the size of the elements.

library(tidyverse)

tmp <- structure(list(contig = c("NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1"), pos = c(130L, 166L, 427L, 1736L, 1918L, 2688L), ref = c("N", "N", "N", "N", "N", "N"), cov = c(41L, 48L, 52L, 54L, 46L, 52L), A = c(39L, 0L, 50L, 44L, 0L, 5L), T = c(2L, 46L, 0L, 0L, 0L, 0L), C = c(0L, 2L, 0L, 10L, 3L, 47L), G = c(0L, 0L, 2L, 0L, 43L, 0L), N = c(0L, 0L, 0L, 0L, 0L, 0L), row_max = c("A", "T", "A", "A", "G", "C"), row_min = c("C", "A", "T", "T", "A", "T")), row.names = c(NA, -6L), class = c("data.table", "data.frame"))

tmp[5:9] %>% 
  apply(1,\(row) rev(names(.)[order(row)])) %>%
  t()
#>      [,1] [,2] [,3] [,4] [,5]
#> [1,] "A"  "T"  "N"  "G"  "C" 
#> [2,] "T"  "C"  "N"  "G"  "A" 
#> [3,] "A"  "G"  "N"  "C"  "T" 
#> [4,] "A"  "C"  "N"  "G"  "T" 
#> [5,] "G"  "C"  "N"  "T"  "A" 
#> [6,] "C"  "A"  "N"  "G"  "T"

Here's an approach excluding 0 for the min part. Also keep in mind that apply converts to character if the data has a mix of numeric and character , so an explicit conversion to numeric avoids odd spaces.

data.frame(tmp, t(apply(tmp, 1, function(x){ nuc_num <- as.numeric(x[5:8])
  c(row_max = colnames(tmp)[5:8][which.max(nuc_num)], 
    row_min = colnames(tmp)[5:8][which(nuc_num > 0)[which.min(nuc_num[nuc_num > 0])]]) })))
          contig  pos ref cov  A  T  C  G N row_max row_min
1 NW_017095466.1  130   N  41 39  2  0  0 0       A       T
2 NW_017095466.1  166   N  48  0 46  2  0 0       T       C
3 NW_017095466.1  427   N  52 50  0  0  2 0       A       G
4 NW_017095466.1 1736   N  54 44  0 10  0 0       A       C
5 NW_017095466.1 1918   N  46  0  0  3 43 0       G       C
6 NW_017095466.1 2688   N  52  5  0 47  0 0       C       A

With dplyr

library(dplyr)

tmp %>% 
  rowwise() %>% 
  mutate(nuc = list(names(across(A:G))), 
         row_max = nuc[which.max(across(A:G))], 
         nuc_nzero = list(which(across(A:G) > 0)), 
         row_min = nuc[nuc_nzero[which.min(across(A:G)[nuc_nzero])]], 
         nuc = NULL, nuc_nzero = NULL) %>% 
  ungroup()
# A tibble: 6 × 11
  contig           pos ref     cov     A     T     C     G     N row_max row_min
  <chr>          <int> <chr> <int> <int> <int> <int> <int> <int> <chr>   <chr>
1 NW_017095466.1   130 N        41    39     2     0     0     0 A       T
2 NW_017095466.1   166 N        48     0    46     2     0     0 T       C
3 NW_017095466.1   427 N        52    50     0     0     2     0 A       G
4 NW_017095466.1  1736 N        54    44     0    10     0     0 A       C
5 NW_017095466.1  1918 N        46     0     0     3    43     0 G       C
6 NW_017095466.1  2688 N        52     5     0    47     0     0 C       A

Data

tmp <- structure(list(contig = c("NW_017095466.1", "NW_017095466.1",
"NW_017095466.1", "NW_017095466.1", "NW_017095466.1", "NW_017095466.1"
), pos = c(130L, 166L, 427L, 1736L, 1918L, 2688L), ref = c("N",
"N", "N", "N", "N", "N"), cov = c(41L, 48L, 52L, 54L, 46L, 52L
), A = c(39L, 0L, 50L, 44L, 0L, 5L), T = c(2L, 46L, 0L, 0L, 0L,
0L), C = c(0L, 2L, 0L, 10L, 3L, 47L), G = c(0L, 0L, 2L, 0L, 43L,
0L), N = c(0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(NA, -6L), class = "data.frame")

Here is another solution:


library(tidyverse)

tmp |> 
  mutate(row = row_number()) |> 
  group_by(row) |> 
  pivot_longer(cols = A:N) |> 
  mutate(max = name[value == max(value)]) |> 
  mutate(value = ifelse(value == 0, NA, value)) |> 
  mutate(tm = min(value, na.rm = T)) |> 
  mutate(value = ifelse(is.na(value), 0, value)) |> 
  mutate(min = name[value == tm]) |> 
  pivot_wider(names_from = name, values_from = value) |> 
  ungroup() |> 
  select(-row, -tm) |> 
  relocate(c(min, max), .after = last_col())
#> # A tibble: 6 × 11
#>   contig           pos ref     cov     A     T     C     G     N min   max  
#>   <chr>          <int> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
#> 1 NW_017095466.1   130 N        41    39     2     0     0     0 T     A    
#> 2 NW_017095466.1   166 N        48     0    46     2     0     0 C     T    
#> 3 NW_017095466.1   427 N        52    50     0     0     2     0 G     A    
#> 4 NW_017095466.1  1736 N        54    44     0    10     0     0 C     A    
#> 5 NW_017095466.1  1918 N        46     0     0     3    43     0 C     G    
#> 6 NW_017095466.1  2688 N        52     5     0    47     0     0 A     C

since your sample dataset seems to be a data.table , here is a data.table approach

# create temporary dataset
tempdata <- melt(tmp, measure.vars = patterns("^[A-Z]$"))[order(-value), .SD[1:2], by = .(pos)][!value == 0,]
# join
tmp[tempdata[, paste0(variable, collapse = ","), by = .(pos)], 
    c("row.max", "row.min") := tstrsplit(i.V1, ","), on = .(pos)][]
#            contig  pos ref cov  A  T  C  G N row.max row.min
# 1: NW_017095466.1  130   N  41 39  2  0  0 0       A       T
# 2: NW_017095466.1  166   N  48  0 46  2  0 0       T       C
# 3: NW_017095466.1  427   N  52 50  0  0  2 0       A       G
# 4: NW_017095466.1 1736   N  54 44  0 10  0 0       A       C
# 5: NW_017095466.1 1918   N  46  0  0  3 43 0       G       C
# 6: NW_017095466.1 2688   N  52  5  0 47  0 0       C       A

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM