简体   繁体   中英

collapse dataframe based on rows and calculate weighted mean r

I would like to collapse the following dataframe

df

Chromosome  Start   End lengthMB    imba    log2    Cn  mCn Cn_
chr1    0   8022945 8.023   0.026905119 -0.001671481    2   1   1.99
chr1    8022945 9168284 1.145   0.030441784 0.000601976 2   1   2
chr1    9168284 9598904 0.431   NA  -0.024952441    2   1   1.91
chr1    9598904 31392788    21.794  0.036011994 0.002151497 3   1   3.01
chr2    0   8022930 8.023   0.026905119 -0.001671481    3   1   2.89
chr2    8022930 9168284 1.145   0.030441784 0.000601976 2   1   1.87
chr2    9168284 9598904 0.431   NA  -0.024952441    2   1   1.57
chr2    9598904 31392788    21.794  0.036011994 0.002151497 2   0   1.87
chr2    31392788    35402000    1.164   0.029733771 0.003149921 2   1   2.01
chr3    0   8040000 1.479   NA  0.000969256 2   1   2
chr3    8040000 9168284 8.185   0.033499045 -0.031338811    1   0   0.89
chr3    9168284 9598904 3.952   0.036792754 0.002847936 1   0   0.78
chr3    9598904 31392788    0.883   0.049003807 -0.021413391    2   1   1.92
chr3    31392788    35402000    4.095   0.037653564 0.011944688 2   1   2.04
chr4    0   8022930 11.065  0.035092332 -0.022844471    2   1   1.91
chr4    8022930 9168284 40.635  0.037690844 0.006703603 2   1   2.02
chr4    9168284 9598904 0.545   0.047435696 -0.021068024    2   1   1.92

By matching only consecutive rows that have the same Cn and mCn values I want to collapse the rows. For example for the first 4 rows, we have the following:

Chromosome  Start   End lengthMB    imba    log2    Cn  mCn Cn_
chr1    0   8022945 8.023   0.026905119 -0.001671481    2   1   1.99
chr1    8022945 9168284 1.145   0.030441784 0.000601976 2   1   2
chr1    9168284 9598904 0.431   NA  -0.024952441    2   1   1.91
chr1    9598904 31392788    21.794  0.036011994 0.002151497 3   1   3.01

I want to collapse consecutive rows where they have the same Cn and mCn score, so for the first three rows that each has a "2" and "1" on the Cn and mCn column respectively and also to change the End column to reflect this collapse.

Chromosome  Start   End lengthMB    imba    log2    Cn  mCn Cn_
chr1    0   9598904 8.023   0.026905119 -0.001671481    2   1   1.99

But I would also like to change the Cn_column so that it is the weighted average Cn_dependant on what the lengthMB score is for that row. So for the first three rows the calculation would be:

((8.023/9.599) * 1.99) + ((1.145/9.599) * 2) + ((0.431/9.599) * 1.91) = 1.987

output for the first four unique Chromosomes:

Chromosome  Start   End lengthMB    imba    log2    Cn  mCn Cn_
chr1    0   9598904 8.023   0.026905119 -0.001671481    2   1   1.99
chr1    9598904 31392788    21.794  0.036011994 0.002151497 3   1   3.01
chr2    0   8022930 8.023   0.026905119 -0.001671481    3   1   2.89
chr2    8022930 9598904 1.145   0.030441784 0.000601976 2   1   1.79
chr2    9598904 31392788    21.794  0.036011994 0.002151497 2   0   1.87
chr2    31392788    35402000    1.164   0.029733771 0.003149921 2   1   2.01
chr3    0   8040000 1.479   NA  0.000969256 2   1   2
chr3    8040000 9598904 8.185   0.033499045 -0.031338811    1   0   0.836
chr3    9598904 35402000    0.883   0.049003807 -0.021413391    2   1   2.02
chr4    0   9598904 11.065  0.035092332 -0.022844471    2   1   2

Tried something like this, but I also don't know how to include the calculation...

squish_segments <- function(sample) {
  setDT(sample)[, .ind:= cumsum(c(TRUE,Start[-1]!=End[-.N])),
    list(lengthMB, probes, snps, imba, log2, Cn, mCn, Cn_)][,
   list(Chr=Chromosome[1], Start=Start[1], End=End[.N]),
   list(lengthMB, probes, snps, imba, log2, Cn, mCn, Cn_, .ind)][,.ind:=NULL][]
}

First, please make your questions more reproducible by providing the dput output of your dataset.

I think this is what you want at a low level.

setkey(df, Chromosome, Cn, mCn, Start)

df[, list(
  Start=min(Start), 
  End=max(End), 
  lengthMB=lengthMB[1], 
  imba=imba[1],
  log2=log2[1],
  Cn_=weighted.mean(Cn_, lengthMB) 
), keyby=list(Chromosome, Cn , mCn)]

This is a dplyr approach.

library(dplyr)

df = read.table(text=
                  "Chromosome  Start   End lengthMB    imba    log2    Cn  mCn Cn_
                chr1    0   8022945 8.023   0.026905119 -0.001671481    2   1   1.99
                chr1    8022945 9168284 1.145   0.030441784 0.000601976 2   1   2
                chr1    9168284 9598904 0.431   NA  -0.024952441    2   1   1.91
                chr1    9598904 31392788    21.794  0.036011994 0.002151497 3   1   3.01
                chr2    0   8022930 8.023   0.026905119 -0.001671481    3   1   2.89
                chr2    8022930 9168284 1.145   0.030441784 0.000601976 2   1   1.87
                chr2    9168284 9598904 0.431   NA  -0.024952441    2   1   1.57
                chr2    9598904 31392788    21.794  0.036011994 0.002151497 2   0   1.87
                chr2    31392788    35402000    1.164   0.029733771 0.003149921 2   1   2.01
                chr3    0   8040000 1.479   NA  0.000969256 2   1   2
                chr3    8040000 9168284 8.185   0.033499045 -0.031338811    1   0   0.89
                chr3    9168284 9598904 3.952   0.036792754 0.002847936 1   0   0.78
                chr3    9598904 31392788    0.883   0.049003807 -0.021413391    2   1   1.92
                chr3    31392788    35402000    4.095   0.037653564 0.011944688 2   1   2.04
                chr4    0   8022930 11.065  0.035092332 -0.022844471    2   1   1.91
                chr4    8022930 9168284 40.635  0.037690844 0.006703603 2   1   2.02
                chr4    9168284 9598904 0.545   0.047435696 -0.021068024    2   1   1.92", header=T)


df %>%
mutate(Consec = ifelse(Chromosome == dplyr::lag(Chromosome, default = Chromosome[1]) &  ## flag consecutive matching chromosomes
                         Cn == dplyr::lag(Cn, default = Cn[1]) & 
                         mCn == dplyr::lag(mCn, default = mCn[1]), 0, 1),
       Consec = cumsum(Consec)) %>%       ## create an id for consecutive matching chromosomes
group_by(Chromosome, Cn, mCn, Consec) %>%
summarize(Cn_ = sum(lengthMB * Cn_)/sum(lengthMB),
            Start = min(Start),
            End = max(End),
            lengthMB = first(lengthMB),
            imba= first(imba),
            log2= first(log2)) %>%
ungroup() %>%    ## only if you want to ungroup
select(Chromosome,Start,End, lengthMB,imba,log2,Cn,mCn,Cn_) %>%  ## to re arrange column order
arrange(Chromosome, Start)


#    Chromosome    Start      End lengthMB       imba         log2    Cn   mCn       Cn_
#        (fctr)    (int)    (int)    (dbl)      (dbl)        (dbl) (int) (int)     (dbl)
# 1        chr1        0  9598904    8.023 0.02690512 -0.001671481     2     1 1.9876008
# 2        chr1  9598904 31392788   21.794 0.03601199  0.002151497     3     1 3.0100000
# 3        chr2        0  8022930    8.023 0.02690512 -0.001671481     3     1 2.8900000
# 4        chr2  8022930  9598904    1.145 0.03044178  0.000601976     2     1 1.7879569
# 5        chr2  9598904 31392788   21.794 0.03601199  0.002151497     2     0 1.8700000
# 6        chr2 31392788 35402000    1.164 0.02973377  0.003149921     2     1 2.0100000
# 7        chr3        0  8040000    1.479         NA  0.000969256     2     1 2.0000000
# 8        chr3  8040000  9598904    8.185 0.03349904 -0.031338811     1     0 0.8541823
# 9        chr3  9598904 35402000    0.883 0.04900381 -0.021413391     2     1 2.0187143
# 10       chr4        0  9598904   11.065 0.03509233 -0.022844471     2     1 1.9956599

Note that lag is a dplyr function, but also a stats package function. I have to write dplyr::lag otherwise there's a conflict when I try to specify default = within lag . I don't know if you or anyone else can replicate this issue.

Could identify unique "events" (consecutive rows with same Cn and mCn score) and then simply loop through those events and modify the rows accordingly. Not the most efficient but should do the job.

txt <- "Chromosome  Start   End lengthMB    imba    log2    Cn  mCn Cn_
chr1    8022945 9168284 1.145   0.030441784 0.000601976 2   1   2
chr1    9168284 9598904 0.431   NA  -0.024952441    2   1   1.91
chr1    9598904 31392788    21.794  0.036011994 0.002151497 3   1   3.01
chr2    0   8022930 8.023   0.026905119 -0.001671481    3   1   2.89
chr2    8022930 9168284 1.145   0.030441784 0.000601976 2   1   1.87
chr2    9168284 9598904 0.431   NA  -0.024952441    2   1   1.57
chr2    9598904 31392788    21.794  0.036011994 0.002151497 2   0   1.87
chr2    31392788    35402000    1.164   0.029733771 0.003149921 2   1   2.01
chr3    0   8040000 1.479   NA  0.000969256 2   1   2
chr3    8040000 9168284 8.185   0.033499045 -0.031338811    1   0   0.89
chr3    9168284 9598904 3.952   0.036792754 0.002847936 1   0   0.78
chr3    9598904 31392788    0.883   0.049003807 -0.021413391    2   1   1.92
chr3    31392788    35402000    4.095   0.037653564 0.011944688 2   1   2.04
chr4    0   8022930 11.065  0.035092332 -0.022844471    2   1   1.91
chr4    8022930 9168284 40.635  0.037690844 0.006703603 2   1   2.02
chr4    9168284 9598904 0.545   0.047435696 -0.021068024    2   1   1.92"

df <- read.table(text=txt, header=T)

#identify each unique event
df$eventid <- with(df, cumsum(c(1,diff(as.numeric(factor(Chromosome)))!=0 | diff(Cn)!=0 | diff(mCn)!=0)))

#loop through events
for(i in 1:max(df$eventid)){
    #identify rows in df with ith event
    rows.i <- which(df$eventid == i)

    df[rows.i,] <- within(df[rows.i,],{
        #calculate values of interest and assign to first row of event
        Start[1] <- min(Start)
        End[1] <- max(End)
        Cn_[1] <- sum((lengthMB/sum(lengthMB))*Cn_) 
        lengthMB[1] <- sum(lengthMB)    
    })

    #drop all but first row
    if(length(rows.i) > 1) df <- df[-rows.i[-1],]

} #end i

result

> df
   Chromosome    Start      End lengthMB       imba         log2 Cn mCn       Cn_ eventid
1        chr1  8022945  9598904    1.576 0.03044178  0.000601976  2   1 1.9753871       1
3        chr1  9598904 31392788   21.794 0.03601199  0.002151497  3   1 3.0100000       2
4        chr2        0  8022930    8.023 0.02690512 -0.001671481  3   1 2.8900000       3
5        chr2  8022930  9598904    1.576 0.03044178  0.000601976  2   1 1.7879569       4
7        chr2  9598904 31392788   21.794 0.03601199  0.002151497  2   0 1.8700000       5
8        chr2 31392788 35402000    1.164 0.02973377  0.003149921  2   1 2.0100000       6
9        chr3        0  8040000    1.479         NA  0.000969256  2   1 2.0000000       7
10       chr3  8040000  9598904   12.137 0.03349904 -0.031338811  1   0 0.8541823       8
12       chr3  9598904 35402000    4.978 0.04900381 -0.021413391  2   1 2.0187143       9
14       chr4        0  9598904   52.245 0.03509233 -0.022844471  2   1 1.9956599      10

If I understood your question properly, you can do it in one line with data.table fast grouping.

library(data.table)
dt[, Cn_dependent := sum((lengthMB/sum(lengthMB)) * Cn_),
   by = .(Chromosome, Cn, mCn)]

to get this:

> dt
   Chromosome    Start      End lengthMB       imba         log2 Cn mCn  Cn_ Cn_dependent
1:       chr1        0  8022945    8.023 0.02690512 -0.001671481  2   1 1.99     1.987601
2:       chr1  8022945  9168284    1.145 0.03044178  0.000601976  2   1 2.00     1.987601
3:       chr1  9168284  9598904    0.431         NA -0.024952441  2   1 1.91     1.987601
4:       chr1  9598904 31392788   21.794 0.03601199  0.002151497  3   1 3.01     3.010000
5:       chr2        0  8022930    8.023 0.02690512 -0.001671481  3   1 2.89     2.890000
6:       chr2  8022930  9168284    1.145 0.03044178  0.000601976  2   1 1.87     1.882285
7:       chr2  9168284  9598904    0.431         NA -0.024952441  2   1 1.57     1.882285
8:       chr2  9598904 31392788   21.794 0.03601199  0.002151497  2   0 1.87     1.870000
9:       chr2 31392788 35402000    1.164 0.02973377  0.003149921  2   1 2.01     1.882285

To collapse by Chromosome , Cn and mCn , you could use keys and unique .

> setkey(dt, "Chromosome", "Cn", "mCn")
> unique(dt)
   Chromosome   Start      End lengthMB       imba         log2 Cn mCn  Cn_ Cn_dependent
1:       chr1       0  8022945    8.023 0.02690512 -0.001671481  2   1 1.99     1.987601
2:       chr1 9598904 31392788   21.794 0.03601199  0.002151497  3   1 3.01     3.010000
3:       chr2 9598904 31392788   21.794 0.03601199  0.002151497  2   0 1.87     1.870000
4:       chr2 8022930  9168284    1.145 0.03044178  0.000601976  2   1 1.87     1.882285
5:       chr2       0  8022930    8.023 0.02690512 -0.001671481  3   1 2.89     2.890000

Here is a dput of the data.table I started with:

> dput(dt)
structure(list(Chromosome = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 
2L, 2L, 2L), .Label = c("chr1", "chr2"), class = "factor"), Start = c(0L, 
8022945L, 9168284L, 9598904L, 0L, 8022930L, 9168284L, 9598904L, 
31392788L), End = c(8022945L, 9168284L, 9598904L, 31392788L, 
8022930L, 9168284L, 9598904L, 31392788L, 35402000L), lengthMB = c(8.023, 
1.145, 0.431, 21.794, 8.023, 1.145, 0.431, 21.794, 1.164), imba = c(0.026905119, 
0.030441784, NA, 0.036011994, 0.026905119, 0.030441784, NA, 0.036011994, 
0.029733771), log2 = c(-0.001671481, 0.000601976, -0.024952441, 
0.002151497, -0.001671481, 0.000601976, -0.024952441, 0.002151497, 
0.003149921), Cn = c(2L, 2L, 2L, 3L, 3L, 2L, 2L, 2L, 2L), mCn = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L), Cn_ = c(1.99, 2, 1.91, 3.01, 
2.89, 1.87, 1.57, 1.87, 2.01)), .Names = c("Chromosome", "Start", 
"End", "lengthMB", "imba", "log2", "Cn", "mCn", "Cn_"), class = c("data.table", 
"data.frame"), row.names = c(NA, -9L), .internal.selfref = <pointer: 0x26abf68>)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM