在数据框中将 ENSEMBL ID 转换为基因 ID

Question

我有一个由 ensembl_gene_id 列出的 RNA-seq 数据的大型数据表，但我想转换为 hgnc_symbol，以便于热图上的可视化。

到目前为止，我有以下代码 - 但不知道如何继续。 从一开始就转换名称会更好，还是仅在子集数据上转换？

我对 Python 更熟悉，通常我会用字典来建立 ensembl_gene_id 与 hgnc_symbol 之间的映射，但在 R 中我不确定该如何实现。我的直觉是 for 循环不具备可扩展性。

任何建议，将不胜感激。

library(biomaRt)
library(RColorBrewer)
# ggplot2 would be used for graphing; the load is currently commented out.
#library(ggplot2)

# Load the gene expression file: mean TPM per gene across cell types.
GE_file <- read.csv(file = "mean_tpm_merged.csv")

# Column names of the expression table (not used again in this snippet).
headers <- names(GE_file)

# Define the BioMart object: Ensembl, human gene dataset.
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")

# Query BioMart.

# Genes of interest, given as HGNC symbols.
GOI <- c("TFEB", "RAC1", "TFE3", "RAB5A")

# Fetch the Ensembl ID <-> HGNC symbol mapping for the genes of interest;
# this two-column table is meant to serve as the lookup "dictionary".
IDs <- getBM(attributes = c("ensembl_gene_id","hgnc_symbol"),
                 filters = "hgnc_symbol", values = GOI,
                 mart = mart)

# Use column 2 of IDs as its row names. Given the attribute order requested
# above this is the hgnc_symbol column, not the Ensembl IDs.
# NOTE(review): row names must be unique, so this assumes each symbol is
# returned only once - confirm.
row.names(IDs) <- IDs[,2]

# Keep only the rows of the large table whose gene ID is one of the
# Ensembl IDs returned for the genes of interest.
Data_subset <- subset(GE_file, gene %in% IDs$ensembl_gene_id)

# Use the first column (the Ensembl gene IDs) as row names, so the heatmap
# rows end up labelled with Ensembl IDs rather than HGNC symbols.
row.names(Data_subset) <- Data_subset[,1]

# Drop the first column (the gene IDs), which is not needed in the numerical
# matrix; columns 2 to 16 are kept.
Data_subset_matrix <- as.matrix(Data_subset[,2:16])

# TODO: colors should be green/red if possible, or whatever is color-blind
# compatible.
# TODO: coloring should go row-wise.
# TODO: excise colors for B cells/NK cells/CD8 T cells.
# Palette of 299 colors interpolated from red to green.
my_palette <- colorRampPalette(c("red","green"))(n = 299)
# Draw the heatmap with values scaled within each row; Colv = NA and
# Rowv = NA turn off the column and row dendrograms and their reordering.
heatmap(Data_subset_matrix, Colv = NA, Rowv = NA, scale = 'row', col = my_palette)

一些相关的输出：

> dput(head(GE_file))
structure(list(gene = c("ENSG00000223116", "ENSG00000233440", 
"ENSG00000207157", "ENSG00000229483", "ENSG00000252952", "ENSG00000235205"
), T.cell..CD4..naive..activated. = c(0, 0.0034414596504, 0, 
0, 0, 0), NK.cell..CD56dim.CD16. = c(0, 0, 0, 0, 0, 0.0139463278778
), T.cell..CD4..TFH = c(0, 0, 0, 0, 0, 0), T.cell..CD4..memory.TREG = c(0, 
0, 0, 0, 0, 0.000568207845073), T.cell..CD4..TH1.17 = c(0, 0.0196376949773, 
0, 0, 0, 0), B.cell..naive = c(0, 0, 0, 0, 0, 0), T.cell..CD4..TH2 = c(0, 
0, 0, 0, 0, 0), T.cell..CD4..TH1 = c(0, 0, 0, 0, 0, 0.000571213481481
), T.cell..CD4..naive = c(0, 0, 0, 0, 0, 0), T.cell..CD4..TH17 = c(0, 
0.00434618468012, 0, 0, 0, 0), Monocyte..classical = c(0, 0, 
0, 0, 0, 0), Monocyte..non.classical = c(0, 0, 0, 0, 0, 0), T.cell..CD4..naive.TREG = c(0, 
0, 0, 0, 0, 0.000821516453853), T.cell..CD8..naive = c(0, 0, 
0, 0, 0, 0.000508869486411), T.cell..CD8..naive..activated. = c(0, 
0.00348680689669, 0, 0, 0, 0)), row.names = c(NA, 6L), class = "data.frame")

Answer 1

一次性完成全部转换：

# Connect to the Ensembl BioMart human gene dataset.
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")

# Look up the HGNC symbol for every Ensembl gene ID found in the first column
# of the expression table, using a single query.
IDs <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  filters = "ensembl_gene_id",
  values = GE_file[, 1],
  mart = mart
)

head(IDs)
  ensembl_gene_id hgnc_symbol
1 ENSG00000207157      RNY3P4
2 ENSG00000229483   LINC00362
3 ENSG00000233440     HMGA1P6
4 ENSG00000235205    TATDN2P3
5 ENSG00000252952    RNU6-58P

# Genes of interest, given as HGNC symbols (two of the symbols shown in the
# lookup output above).
GOI <- c("RNY3P4", "TATDN2P3")

简单的方法：先找出目标基因对应的 ensembl id，再据此对主表取子集：

# Ensembl IDs that correspond to the genes of interest (HGNC symbols in GOI).
GOI_ens <- IDs$ensembl_gene_id[IDs$hgnc_symbol %in% GOI]

# Keep only the expression rows for those Ensembl IDs.
Data_subset <- subset(GE_file, gene %in% GOI_ens)

# Label each row with its HGNC symbol so that a heatmap built from this table
# shows gene symbols instead of bare row numbers. make.unique() guards against
# two Ensembl IDs sharing one symbol, because row names must be unique.
rownames(Data_subset) <- make.unique(
  IDs$hgnc_symbol[match(Data_subset$gene, IDs$ensembl_gene_id)]
)

# Drop the gene ID column, leaving only the numeric expression columns;
# drop = FALSE keeps a data frame even if a single column remains.
Data_subset <- Data_subset[, -1, drop = FALSE]

字典方式也总是可行的，但你需要确保没有重复的符号：

# The dictionary is keyed by Ensembl ID, so keep only the first row for each
# ID; this guarantees exactly one symbol per key.
dedup <- !duplicated(IDs$ensembl_gene_id)

# Named character vector used as a dictionary:
# names are Ensembl IDs, values are HGNC symbols.
dict <- setNames(IDs$hgnc_symbol[dedup], IDs$ensembl_gene_id[dedup])

# Look up by character so that a factor `gene` column is matched by name rather
# than by integer code. IDs absent from the dictionary yield NA, which %in%
# treats as "no match".
subset(GE_file, dict[as.character(gene)] %in% GOI)

在数据框中将 ENSEMBL ID 转换为基因 ID

问题描述

1 个解决方案

解决方案1
0 2020-06-05 22:02:46

在数据框中将 ENSEMBL ID 转换为基因 ID

问题描述

1 个解决方案

解决方案1 0 2020-06-05 22:02:46

解决方案1
0 2020-06-05 22:02:46