Faster processing alternative to a for loop

Question

I am using a for loop to replace the values in xen.biomart$chromosome_name (column 3) that match chr.alias.biomart$ensembl with the value of chr.alias.biomart$ucsc from the same chr.alias.biomart row. It works, but it takes too long to process (more than 20 min). Is there a faster alternative?

# Visit every row of xen.biomart and, for each one, every row of
# chr.alias.biomart. Whenever the value in column 3 of xen.biomart equals the
# alias row's `ensembl` value, overwrite it with that alias row's `ucsc` value.
# Every (row, alias) pair is visited, and each visit does a single-cell
# data-frame read and assignment.
for(i in 1:nrow(xen.biomart)){
  for(x in 1:nrow(chr.alias.biomart)){
    # The logical index selects the single cell only when it matches, so a
    # non-matching cell is left unchanged.
    xen.biomart[i,3][xen.biomart[i,3] == chr.alias.biomart$ensembl[x]] <- chr.alias.biomart$ucsc[x]
  }}

xen.biomart has 146816 rows, and chr.alias.biomart has 46 rows holding the reference values that I want to use as replacements.

> head(xen.biomart[,3])
[1] "MT" "MT" "MT" "MT" "MT" "MT"

> head(chr.alias.biomart)
  ensembl  ucsc assembly    genbank      refseq
1       1  chr1     Chr1 CM004443.2 NC_030677.2
2      10 chr10    Chr10 CM004452.2 NC_030686.2
3       2  chr2     Chr2 CM004444.2 NC_030678.2
4       3  chr3     Chr3 CM004445.2 NC_030679.2
5       4  chr4     Chr4 CM004446.2 NC_030680.2
6       5  chr5     Chr5 CM004447.2 NC_030681.2

> dput(xen.biomart[c(1,1000,10000,15000), ])
structure(list(ensembl_gene_id = c("ENSXETG00000034356", "ENSXETG00000034782", 
"ENSXETG00000029203", "ENSXETG00000021054"), external_gene_name = c("", 
"", "xtr-mir-144", "cdk2ap2"), chromosome_name = c("MT", "1", 
"2", "3"), start_position = c(1L, 122943147L, 34088294L, 148518850L
), end_position = c(68L, 122971793L, 34088355L, 148548901L), 
    description = c("", "", "xtr-mir-144 [Source:miRBase;Acc:MI0004938]", 
    "claudin 15, 1 [Source:Xenbase;Acc:XB-GENE-994817]")), row.names = c(1L, 
1000L, 10000L, 15000L), class = "data.frame")

> dput(chr.alias.biomart[c(1:10,46),])
structure(list(ensembl = c("1", "10", "2", "3", "4", "5", "6", 
"7", "8", "9", "MT"), ucsc = c("chr1", "chr10", "chr2", "chr3", 
"chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chrM"), assembly = c("Chr1", 
"Chr10", "Chr2", "Chr3", "Chr4", "Chr5", "Chr6", "Chr7", "Chr8", 
"Chr9", "MT"), genbank = c("CM004443.2", "CM004452.2", "CM004444.2", 
"CM004445.2", "CM004446.2", "CM004447.2", "CM004448.2", "CM004449.2", 
"CM004450.2", "CM004451.2", "MT"), refseq = c("NC_030677.2", 
"NC_030686.2", "NC_030678.2", "NC_030679.2", "NC_030680.2", "NC_030681.2", 
"NC_030682.2", "NC_030683.2", "NC_030684.2", "NC_030685.2", "NC_006839.1"
)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 46L
), class = "data.frame")

Answer 1

This should be much faster. If you need more speed, use the data.table package.

library(dplyr)
## Rename the chromosomes in xen.biomart from Ensembl names to UCSC names
## using the chr.alias.biomart lookup table. Rows whose chromosome_name has no
## alias keep their original value.
xen.biomart <- xen.biomart %>%
  ## join the relevant alias column into xen.biomart
  left_join(
    chr.alias.biomart %>%
      select(ensembl, ucsc) %>%
      ## keep one alias row per Ensembl name (the first one) so that a
      ## duplicated key in the lookup table cannot add rows to xen.biomart
      distinct(ensembl, .keep_all = TRUE),
    by = c("chromosome_name" = "ensembl")
  ) %>%
  ## replace chromosome_name with the ucsc value wherever a match was found
  ## (ucsc is NA for unmatched rows, so coalesce() falls back to the original)
  mutate(chromosome_name = coalesce(ucsc, chromosome_name)) %>%
  ## drop the helper ucsc column again
  select(-ucsc)
#      ensembl_gene_id external_gene_name chromosome_name start_position end_position
# 1 ENSXETG00000034356                               chrM              1           68
# 2 ENSXETG00000034782                               chr1      122943147    122971793
# 3 ENSXETG00000029203        xtr-mir-144            chr2       34088294     34088355
# 4 ENSXETG00000021054            cdk2ap2            chr3      148518850    148548901
#                                         description
# 1                                                  
# 2                                                  
# 3        xtr-mir-144 [Source:miRBase;Acc:MI0004938]
# 4 claudin 15, 1 [Source:Xenbase;Acc:XB-GENE-994817]

Faster processing alternative to a for loop

Question

1 answers

solution1
0 2022-09-08 20:22:34

Faster processing alternative to a for loop

Question

1 answers

solution1 0 2022-09-08 20:22:34

solution1
0 2022-09-08 20:22:34