简体   繁体   中英

Trying to convert Ensembl ID to gene name in R (biomaRt)

I have a large dataset of gene expression data and I'm trying to convert the gene identifiers into gene names using biomaRt in RStudio, but for some reason when I use the merge function on my data frames, my entire data table is merged wrong/erased. I've looked at the previous questions here , but no matter what I try, my code doesn't seem to work properly. Thank you infinitely!

library(biomaRt)
    
resdata <- merge(as.data.frame(res), as.data.frame(counts(dds, normalized=TRUE)), by="row.names", sort=FALSE)
names(resdata)[1] <- "genes"
head(resdata)

## Write results
resdata <- resdata[complete.cases(resdata), ]

dim(resdata)

The problems start here:

#to convert gene accession number to gene name

charg <- resdata$genes
head(charg)

charg2 = sapply(strsplit(charg, '.', fixed=T), function(x) x[1])

ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl")

theBM = getBM(attributes='hgnc_symbol', 
          filters = 'ensembl_gene_id', 
          values = charg2, 
          mart = ensembl)

resdata <- merge.data.frame(resdata, theBM, by.x="genes",by.y="hgnc_symbol")

# a <- c(resdata[3])
# counts_resdata <-counts[resdata$ensembl_gene_id,]
# row.names(counts_resdata) <- resdata[,"V1"]
# cal_z_score <- function(x){
#   (x - mean(x)) / sd(x)
# }

write.csv(resdata, file="diffexprresultsHEK.csv")
dev.off()

> dput(head(resdata))
structure(list(genes = structure(c("ENSG00000261150.2", "ENSG00000164877.18", 
"ENSG00000120334.15", "ENSG00000100906.10", "ENSG00000182759.3", 
"ENSG00000124145.6"), class = "AsIs"), baseMean = c(4093.85581350533, 
2362.58393155573, 3727.90538524843, 6269.83601940967, 1514.2066991352, 
4802.56186913745), log2FoldChange = c(-7.91660950515258, -5.26346217291626, 
3.32325541003148, 2.95482654632078, -5.67082078657074, 2.79396304109662
), lfcSE = c(0.192088463317979, 0.149333035266368, 0.105355230912976, 
0.097569264524605, 0.194208068005162, 0.0965853229316347), stat = c(-41.2133522670104, 
-35.2464688307429, 31.5433356391815, 30.2843990955331, -29.1997178326289, 
28.9274079776516), pvalue = c(0, 3.88608699685236e-272, 2.21307385030673e-218, 
1.83983881587879e-201, 1.95527687476496e-187, 5.40010609376884e-184
), padj = c(0, 3.9601169541424e-268, 1.50348860477005e-214, 9.3744387266064e-198, 
7.97009959691694e-184, 1.83432603828505e-180), `HEK-FUS1-1.counts` = c(8260.9703617894, 
5075.51515177084, 665.085490083024, 1513.61286043731, 3440.18729968435, 
1262.3583419615), `HEK-FUS1-2.counts` = c(8046.96326903085, 4134.79795973702, 
690.697680591815, 1346.52518701783, 2499.92325557892, 1154.73922910593
), `HEK-H149A-1.counts` = c(34.3284200812733, 113.825813953696, 
6450.12945737609, 10806.2252897945, 60.5264248801398, 8302.96076228903
), `HEK-H149A-2.counts` = c(33.1612031197744, 126.196800761364, 
7105.70891294277, 11412.980740389, 56.1898163973955, 8490.18914319335
)), row.names = c(NA, 6L), class = "data.frame")

Here's some output (where I'm struggling):

> head(charg)
[1] "ENSG00000261150.2"  "ENSG00000164877.18" "ENSG00000120334.15"
[4] "ENSG00000100906.10" "ENSG00000182759.3"  "ENSG00000124145.6" 

> dim(theBM)
[1] 0 1

> head(theBM)
[1] ensembl_gene_id
<0 rows> (or 0-length row.names)

> dim(resdata)
[1] 20381    11
> resdata <- merge.data.frame(resdata, theBM, by.x="genes",by.y="ensembl_gene_id")
> dim(resdata) #after merge
[1]  0 11 #isn't correct -- just row names! where'd my genes go?

Edit: Problems solved! Turns out I was referencing getBM wrong. Thank you all!

If you want to just overwrite the Ensemble IDs with the HGNC IDs you can do it in one step:

library(biomaRt)
names(resdata)[1] <- "genes"
head(resdata)

## Write results
resdata <- resdata[complete.cases(resdata), ]

dim(resdata)

charg <- resdata$genes
head(charg)

charg2 = sapply(strsplit(charg, '.', fixed=T), function(x) x[1])

ensembl = useMart(biomart = "ensembl", dataset="hsapiens_gene_ensembl")

resdata[1] = getBM(attributes='hgnc_symbol', 
              filters = 'ensembl_gene_id', 
              values = charg2, 
              mart = ensembl)
resdata

(This keeps Log2FC as column 3, which looks right based on the next steps in your pipeline, but if you want something different let me know and I'll update my answer to suit)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM