This vignette shows how to map homologous gene symbols from Macaca fascicularis to the human genome.
require(biomaRt)
require(tidyverse)
require(SignacX)
After mapping the reads to the Macaca fascicularis genome, we load the genes, which were generated from the output of the cellranger pipeline from 10X Genomics.
# Load the feature table produced by the cellranger pipeline. The file has no
# header row, so read.delim() names the columns V1, V2 and V3; V1 holds the
# Ensembl gene IDs that are used for the ortholog lookup further down.
features.tsv <- read.delim(
  "fls/features.tsv.gz",
  header = FALSE,
  stringsAsFactors = FALSE
)
# Preview the first rows.
head(features.tsv)
## V1 V2 V3
## 1 ENSMFAG00000044637 PGBD2 Gene Expression
## 2 ENSMFAG00000039056 ZNF692 Gene Expression
## 3 ENSMFAG00000030010 ZNF672 Gene Expression
## 4 ENSMFAG00000002737 SH3BP5L Gene Expression
## 5 ENSMFAG00000000508 LYPD8 Gene Expression
## 6 ENSMFAG00000040572 ENSMFAG00000040572 Gene Expression
# get human and cyno gene symbols
# Both marts are opened against the same Ensembl archive host so that the two
# species' annotations come from the same snapshot.
human.R95 <- useMart(
  host = "jan2019.archive.ensembl.org",
  biomart = "ENSEMBL_MART_ENSEMBL",
  dataset = "hsapiens_gene_ensembl"
)
cyno.R95 <- useMart(
  host = "jan2019.archive.ensembl.org",
  biomart = "ENSEMBL_MART_ENSEMBL",
  dataset = "mfascicularis_gene_ensembl"
)
# values = listeENSID: list of cynomolgus ensembl IDs to be retrieved.
listeENSID <- features.tsv$V1

# Query the linked datasets: for each cynomolgus Ensembl ID, fetch its own ID
# and gene name (attributes, from the cyno mart) together with the HGNC symbol
# and Ensembl ID of the linked human gene (attributesL, from the human mart).
orthologs <- getLDS(
  attributes = c("ensembl_gene_id", "external_gene_name"),
  filters = "ensembl_gene_id",
  values = listeENSID,
  mart = cyno.R95,
  attributesL = c("hgnc_symbol", "ensembl_gene_id"),
  martL = human.R95
)

# Convert to a tibble and give the four columns descriptive names, in the
# order they were requested above (cyno ID, cyno name, human symbol, human ID).
orthologs <- as_tibble(orthologs)
colnames(orthologs) <- c("GeneID", "cynoSymbol", "HumanSymbol", "HumanGeneID")
# keep only 1:1 orthologs
# Count how many result rows each cynomolgus gene ID has and keep the IDs that
# occur at most once, i.e. cyno genes linked to a single human gene. This does
# not check the reverse direction: one human symbol may still be reached from
# several cyno genes.
one2one <- orthologs %>%
  group_by(GeneID) %>%
  summarise(n_hits = n()) %>%
  filter(n_hits <= 1) %>%
  pull(GeneID)

# Restrict the ortholog table to those singly-mapped cyno genes.
orthologs <- orthologs %>%
  filter(GeneID %in% one2one)
# replace empty HumanSymbol (where there isn't a gene name for a homologous gene) with NA
# The same replacement is applied to cynoSymbol, so that missing names in
# either species can be detected with is.na() instead of comparing to "".
orthologs <- orthologs %>%
  mutate(
    HumanSymbol = replace(HumanSymbol, HumanSymbol == "", NA),
    cynoSymbol = replace(cynoSymbol, cynoSymbol == "", NA)
  )
# Position of every input cyno gene ID in the ortholog table (NA when the gene
# has no row there).
idx <- match(listeENSID, orthologs$GeneID)

# Human symbol for each input gene, in the order of listeENSID; NA where the
# gene was not matched or has no human symbol.
xx <- orthologs$HumanSymbol[idx]

# Flag the input genes that received a human symbol.
logik <- !is.na(xx) # sum(logik) returns 17,365 homologous genes

# Keep only the mapped human symbols.
xx <- xx[logik]

# Drop ortholog rows that have no human symbol.
orthologs <- orthologs[!is.na(orthologs$HumanSymbol), ]
# note: several of these genes are not unique mappers; we will aggregate them later or make them
# unique. To aggregate, where E is the sparse expression matrix with rownames set to xx: E =
# Matrix.utils::aggregate.Matrix(E, row.names(E))
Now we have mapped homologous gene symbols across species:
# Show the first six rows of the mapped ortholog table.
head(orthologs, n = 6L)
## GeneID cynoSymbol HumanSymbol HumanGeneID
## 1 ENSMFAG00000046426 ND6 MT-ND6 ENSG00000198695
## 2 ENSMFAG00000002805 POLG2 POLG2 ENSG00000256525
## 3 ENSMFAG00000046418 COX2 MT-CO2 ENSG00000198712
## 4 ENSMFAG00000042657 SLC38A3 SLC38A3 ENSG00000188338
## 5 ENSMFAG00000042891 HMOX1 HMOX1 ENSG00000100292
## 6 ENSMFAG00000038079 SHISA9 SHISA9 ENSG00000237515
After mapping homologous genes, Signac can be used to classify the cell types.
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] compiler_4.0.3 magrittr_2.0.1 formatR_1.7 htmltools_0.5.1.1
## [5] tools_4.0.3 yaml_2.2.1 stringi_1.5.3 rmarkdown_2.6
## [9] knitr_1.30 stringr_1.4.0 digest_0.6.27 xfun_0.20
## [13] rlang_0.4.10 evaluate_0.14