I would like to collapse the following dataframe
df
Chromosome Start End lengthMB imba log2 Cn mCn Cn_
chr1 0 8022945 8.023 0.026905119 -0.001671481 2 1 1.99
chr1 8022945 9168284 1.145 0.030441784 0.000601976 2 1 2
chr1 9168284 9598904 0.431 NA -0.024952441 2 1 1.91
chr1 9598904 31392788 21.794 0.036011994 0.002151497 3 1 3.01
chr2 0 8022930 8.023 0.026905119 -0.001671481 3 1 2.89
chr2 8022930 9168284 1.145 0.030441784 0.000601976 2 1 1.87
chr2 9168284 9598904 0.431 NA -0.024952441 2 1 1.57
chr2 9598904 31392788 21.794 0.036011994 0.002151497 2 0 1.87
chr2 31392788 35402000 1.164 0.029733771 0.003149921 2 1 2.01
chr3 0 8040000 1.479 NA 0.000969256 2 1 2
chr3 8040000 9168284 8.185 0.033499045 -0.031338811 1 0 0.89
chr3 9168284 9598904 3.952 0.036792754 0.002847936 1 0 0.78
chr3 9598904 31392788 0.883 0.049003807 -0.021413391 2 1 1.92
chr3 31392788 35402000 4.095 0.037653564 0.011944688 2 1 2.04
chr4 0 8022930 11.065 0.035092332 -0.022844471 2 1 1.91
chr4 8022930 9168284 40.635 0.037690844 0.006703603 2 1 2.02
chr4 9168284 9598904 0.545 0.047435696 -0.021068024 2 1 1.92
By matching only consecutive rows that have the same Cn and mCn values I want to collapse the rows. For example for the first 4 rows, we have the following:
Chromosome Start End lengthMB imba log2 Cn mCn Cn_
chr1 0 8022945 8.023 0.026905119 -0.001671481 2 1 1.99
chr1 8022945 9168284 1.145 0.030441784 0.000601976 2 1 2
chr1 9168284 9598904 0.431 NA -0.024952441 2 1 1.91
chr1 9598904 31392788 21.794 0.036011994 0.002151497 3 1 3.01
I want to collapse consecutive rows where they have the same Cn and mCn score, so for the first three rows that each has a "2" and "1" on the Cn and mCn column respectively and also to change the End column to reflect this collapse.
Chromosome Start End lengthMB imba log2 Cn mCn Cn_
chr1 0 9598904 8.023 0.026905119 -0.001671481 2 1 1.99
But I would also like to change the Cn_column
so that it is the weighted average Cn_dependant
on what the lengthMB
score is for that row. So for the first three rows the calculation would be:
((8.023/9.599) * 1.99) + ((1.145/9.599) * 2) + ((0.431/9.599) * 1.91) = 1.987
output for the first four unique Chromosomes:
Chromosome Start End lengthMB imba log2 Cn mCn Cn_
chr1 0 9598904 8.023 0.026905119 -0.001671481 2 1 1.99
chr1 9598904 31392788 21.794 0.036011994 0.002151497 3 1 3.01
chr2 0 8022930 8.023 0.026905119 -0.001671481 3 1 2.89
chr2 8022930 9598904 1.145 0.030441784 0.000601976 2 1 1.79
chr2 9598904 31392788 21.794 0.036011994 0.002151497 2 0 1.87
chr2 31392788 35402000 1.164 0.029733771 0.003149921 2 1 2.01
chr3 0 8040000 1.479 NA 0.000969256 2 1 2
chr3 8040000 9598904 8.185 0.033499045 -0.031338811 1 0 0.836
chr3 9598904 35402000 0.883 0.049003807 -0.021413391 2 1 2.02
chr4 0 9598904 11.065 0.035092332 -0.022844471 2 1 2
Tried something like this, but I also don't know how to include the calculation...
squish_segments <- function(sample) {
setDT(sample)[, .ind:= cumsum(c(TRUE,Start[-1]!=End[-.N])),
list(lengthMB, probes, snps, imba, log2, Cn, mCn, Cn_)][,
list(Chr=Chromosome[1], Start=Start[1], End=End[.N]),
list(lengthMB, probes, snps, imba, log2, Cn, mCn, Cn_, .ind)][,.ind:=NULL][]
}
First, please make your questions more reproducible by providing the dput
output of your dataset.
I think this is what you want at a low level.
setkey(df, Chromosome, Cn, mCn, Start)
df[, list(
Start=min(Start),
End=max(End),
lengthMB=lengthMB[1],
imba=imba[1],
log2=log2[1],
Cn_=weighted.mean(Cn_, lengthMB)
), keyby=list(Chromosome, Cn , mCn)]
This is a dplyr
approach.
library(dplyr)
df = read.table(text=
"Chromosome Start End lengthMB imba log2 Cn mCn Cn_
chr1 0 8022945 8.023 0.026905119 -0.001671481 2 1 1.99
chr1 8022945 9168284 1.145 0.030441784 0.000601976 2 1 2
chr1 9168284 9598904 0.431 NA -0.024952441 2 1 1.91
chr1 9598904 31392788 21.794 0.036011994 0.002151497 3 1 3.01
chr2 0 8022930 8.023 0.026905119 -0.001671481 3 1 2.89
chr2 8022930 9168284 1.145 0.030441784 0.000601976 2 1 1.87
chr2 9168284 9598904 0.431 NA -0.024952441 2 1 1.57
chr2 9598904 31392788 21.794 0.036011994 0.002151497 2 0 1.87
chr2 31392788 35402000 1.164 0.029733771 0.003149921 2 1 2.01
chr3 0 8040000 1.479 NA 0.000969256 2 1 2
chr3 8040000 9168284 8.185 0.033499045 -0.031338811 1 0 0.89
chr3 9168284 9598904 3.952 0.036792754 0.002847936 1 0 0.78
chr3 9598904 31392788 0.883 0.049003807 -0.021413391 2 1 1.92
chr3 31392788 35402000 4.095 0.037653564 0.011944688 2 1 2.04
chr4 0 8022930 11.065 0.035092332 -0.022844471 2 1 1.91
chr4 8022930 9168284 40.635 0.037690844 0.006703603 2 1 2.02
chr4 9168284 9598904 0.545 0.047435696 -0.021068024 2 1 1.92", header=T)
df %>%
mutate(Consec = ifelse(Chromosome == dplyr::lag(Chromosome, default = Chromosome[1]) & ## flag consecutive matching chromosomes
Cn == dplyr::lag(Cn, default = Cn[1]) &
mCn == dplyr::lag(mCn, default = mCn[1]), 0, 1),
Consec = cumsum(Consec)) %>% ## create an id for consecutive matching chromosomes
group_by(Chromosome, Cn, mCn, Consec) %>%
summarize(Cn_ = sum(lengthMB * Cn_)/sum(lengthMB),
Start = min(Start),
End = max(End),
lengthMB = first(lengthMB),
imba= first(imba),
log2= first(log2)) %>%
ungroup() %>% ## only if you want to ungroup
select(Chromosome,Start,End, lengthMB,imba,log2,Cn,mCn,Cn_) %>% ## to re arrange column order
arrange(Chromosome, Start)
# Chromosome Start End lengthMB imba log2 Cn mCn Cn_
# (fctr) (int) (int) (dbl) (dbl) (dbl) (int) (int) (dbl)
# 1 chr1 0 9598904 8.023 0.02690512 -0.001671481 2 1 1.9876008
# 2 chr1 9598904 31392788 21.794 0.03601199 0.002151497 3 1 3.0100000
# 3 chr2 0 8022930 8.023 0.02690512 -0.001671481 3 1 2.8900000
# 4 chr2 8022930 9598904 1.145 0.03044178 0.000601976 2 1 1.7879569
# 5 chr2 9598904 31392788 21.794 0.03601199 0.002151497 2 0 1.8700000
# 6 chr2 31392788 35402000 1.164 0.02973377 0.003149921 2 1 2.0100000
# 7 chr3 0 8040000 1.479 NA 0.000969256 2 1 2.0000000
# 8 chr3 8040000 9598904 8.185 0.03349904 -0.031338811 1 0 0.8541823
# 9 chr3 9598904 35402000 0.883 0.04900381 -0.021413391 2 1 2.0187143
# 10 chr4 0 9598904 11.065 0.03509233 -0.022844471 2 1 1.9956599
Note that lag
is a dplyr
function, but also a stats
package function. I have to write dplyr::lag
otherwise there's a conflict when I try to specify default =
within lag
. I don't know if you or anyone else can replicate this issue.
Could identify unique "events" (consecutive rows with same Cn and mCn score) and then simply loop through those events and modify the rows accordingly. Not the most efficient but should do the job.
txt <- "Chromosome Start End lengthMB imba log2 Cn mCn Cn_
chr1 8022945 9168284 1.145 0.030441784 0.000601976 2 1 2
chr1 9168284 9598904 0.431 NA -0.024952441 2 1 1.91
chr1 9598904 31392788 21.794 0.036011994 0.002151497 3 1 3.01
chr2 0 8022930 8.023 0.026905119 -0.001671481 3 1 2.89
chr2 8022930 9168284 1.145 0.030441784 0.000601976 2 1 1.87
chr2 9168284 9598904 0.431 NA -0.024952441 2 1 1.57
chr2 9598904 31392788 21.794 0.036011994 0.002151497 2 0 1.87
chr2 31392788 35402000 1.164 0.029733771 0.003149921 2 1 2.01
chr3 0 8040000 1.479 NA 0.000969256 2 1 2
chr3 8040000 9168284 8.185 0.033499045 -0.031338811 1 0 0.89
chr3 9168284 9598904 3.952 0.036792754 0.002847936 1 0 0.78
chr3 9598904 31392788 0.883 0.049003807 -0.021413391 2 1 1.92
chr3 31392788 35402000 4.095 0.037653564 0.011944688 2 1 2.04
chr4 0 8022930 11.065 0.035092332 -0.022844471 2 1 1.91
chr4 8022930 9168284 40.635 0.037690844 0.006703603 2 1 2.02
chr4 9168284 9598904 0.545 0.047435696 -0.021068024 2 1 1.92"
df <- read.table(text=txt, header=T)
#identify each unique event
df$eventid <- with(df, cumsum(c(1,diff(as.numeric(factor(Chromosome)))!=0 | diff(Cn)!=0 | diff(mCn)!=0)))
#loop through events
for(i in 1:max(df$eventid)){
#identify rows in df with ith event
rows.i <- which(df$eventid == i)
df[rows.i,] <- within(df[rows.i,],{
#calculate values of interest and assign to first row of event
Start[1] <- min(Start)
End[1] <- max(End)
Cn_[1] <- sum((lengthMB/sum(lengthMB))*Cn_)
lengthMB[1] <- sum(lengthMB)
})
#drop all but first row
if(length(rows.i) > 1) df <- df[-rows.i[-1],]
} #end i
result
> df
Chromosome Start End lengthMB imba log2 Cn mCn Cn_ eventid
1 chr1 8022945 9598904 1.576 0.03044178 0.000601976 2 1 1.9753871 1
3 chr1 9598904 31392788 21.794 0.03601199 0.002151497 3 1 3.0100000 2
4 chr2 0 8022930 8.023 0.02690512 -0.001671481 3 1 2.8900000 3
5 chr2 8022930 9598904 1.576 0.03044178 0.000601976 2 1 1.7879569 4
7 chr2 9598904 31392788 21.794 0.03601199 0.002151497 2 0 1.8700000 5
8 chr2 31392788 35402000 1.164 0.02973377 0.003149921 2 1 2.0100000 6
9 chr3 0 8040000 1.479 NA 0.000969256 2 1 2.0000000 7
10 chr3 8040000 9598904 12.137 0.03349904 -0.031338811 1 0 0.8541823 8
12 chr3 9598904 35402000 4.978 0.04900381 -0.021413391 2 1 2.0187143 9
14 chr4 0 9598904 52.245 0.03509233 -0.022844471 2 1 1.9956599 10
If I understood your question properly, you can do it in one line with data.table
fast grouping.
library(data.table)
dt[, Cn_dependent := sum((lengthMB/sum(lengthMB)) * Cn_),
by = .(Chromosome, Cn, mCn)]
to get this:
> dt
Chromosome Start End lengthMB imba log2 Cn mCn Cn_ Cn_dependent
1: chr1 0 8022945 8.023 0.02690512 -0.001671481 2 1 1.99 1.987601
2: chr1 8022945 9168284 1.145 0.03044178 0.000601976 2 1 2.00 1.987601
3: chr1 9168284 9598904 0.431 NA -0.024952441 2 1 1.91 1.987601
4: chr1 9598904 31392788 21.794 0.03601199 0.002151497 3 1 3.01 3.010000
5: chr2 0 8022930 8.023 0.02690512 -0.001671481 3 1 2.89 2.890000
6: chr2 8022930 9168284 1.145 0.03044178 0.000601976 2 1 1.87 1.882285
7: chr2 9168284 9598904 0.431 NA -0.024952441 2 1 1.57 1.882285
8: chr2 9598904 31392788 21.794 0.03601199 0.002151497 2 0 1.87 1.870000
9: chr2 31392788 35402000 1.164 0.02973377 0.003149921 2 1 2.01 1.882285
To collapse by Chromosome
, Cn
and mCn
, you could use keys and unique
.
> setkey(dt, "Chromosome", "Cn", "mCn")
> unique(dt)
Chromosome Start End lengthMB imba log2 Cn mCn Cn_ Cn_dependent
1: chr1 0 8022945 8.023 0.02690512 -0.001671481 2 1 1.99 1.987601
2: chr1 9598904 31392788 21.794 0.03601199 0.002151497 3 1 3.01 3.010000
3: chr2 9598904 31392788 21.794 0.03601199 0.002151497 2 0 1.87 1.870000
4: chr2 8022930 9168284 1.145 0.03044178 0.000601976 2 1 1.87 1.882285
5: chr2 0 8022930 8.023 0.02690512 -0.001671481 3 1 2.89 2.890000
Here is a dput
of the data.table
I started with:
> dput(dt)
structure(list(Chromosome = structure(c(1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L), .Label = c("chr1", "chr2"), class = "factor"), Start = c(0L,
8022945L, 9168284L, 9598904L, 0L, 8022930L, 9168284L, 9598904L,
31392788L), End = c(8022945L, 9168284L, 9598904L, 31392788L,
8022930L, 9168284L, 9598904L, 31392788L, 35402000L), lengthMB = c(8.023,
1.145, 0.431, 21.794, 8.023, 1.145, 0.431, 21.794, 1.164), imba = c(0.026905119,
0.030441784, NA, 0.036011994, 0.026905119, 0.030441784, NA, 0.036011994,
0.029733771), log2 = c(-0.001671481, 0.000601976, -0.024952441,
0.002151497, -0.001671481, 0.000601976, -0.024952441, 0.002151497,
0.003149921), Cn = c(2L, 2L, 2L, 3L, 3L, 2L, 2L, 2L, 2L), mCn = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L), Cn_ = c(1.99, 2, 1.91, 3.01,
2.89, 1.87, 1.57, 1.87, 2.01)), .Names = c("Chromosome", "Start",
"End", "lengthMB", "imba", "log2", "Cn", "mCn", "Cn_"), class = c("data.table",
"data.frame"), row.names = c(NA, -9L), .internal.selfref = <pointer: 0x26abf68>)
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.