Creating custom pathway annotations or gene sets
In this vignette, we show you how to create and use your own custom pathway annotations or gene sets with pagoda.
GO annotations
# Use the org.Hs.eg.db package for GO annotations
library(org.Hs.eg.db)
# Translate gene names to ids
ids <- unlist(lapply(mget(rownames(cd), org.Hs.egALIAS2EG, ifnotfound = NA), function(x) x[1]))
# Reverse map
rids <- names(ids)
names(rids) <- ids
# Convert ids per GO category to gene names
go.env <- eapply(org.Hs.egGO2ALLEGS, function(x) as.character(na.omit(rids[x])))
go.env <- clean.gos(go.env) # Remove GOs with too few or too many genes
go.env <- list2env(go.env) # Convert to an environment
# Test
class(go.env)
## [1] "environment"
head(ls(go.env)) # Look at gene set names
## [1] "GO:0000002" "GO:0000003" "GO:0000012" "GO:0000014" "GO:0000018"
## [6] "GO:0000022"
head(get(ls(go.env)[1], go.env)) # Look at one gene set
## [1] "SLC25A4" "DNA2" "TYMP" "LIG3" "LIG3" "MEF2A"
BioMart
Alternatively, we can use Ensembl's BioMart service to get the GO annotations.
library(biomaRt)
library(GO.db)
# Initialize the connection to the Ensembl BioMart Service
# Available datasets can be listed with
# listDatasets(useMart("ENSEMBL_MART_ENSEMBL", host="www.ensembl.org"))
# Use mmusculus_gene_ensembl for mouse
ensembl <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host="www.ensembl.org")
# Constructs a dataframe with two columns: hgnc_symbol and go_id
# If rownames are Ensembl IDs, use ensembl_gene_id as filter value
go <- getBM(attributes = c("hgnc_symbol", "go_id"), filters = "hgnc_symbol", values = rownames(cd), mart = ensembl)
# Use the GO.db library to add a column with the GO-term to the dataframe
go$term <- Term(go$go_id)
# Create a named list of character vectors out of the df
s = split(go$hgnc_symbol, paste(go$go_id,go$term))
# Saves the list as a R environment
go.env <- list2env(s)
# Test
class(go.env)
## [1] "environment"
head(ls(go.env)) # Look at gene set names
## [1] " NA"
## [2] "GO:0000002 mitochondrial genome maintenance"
## [3] "GO:0000003 reproduction"
## [4] "GO:0000009 alpha-1,6-mannosyltransferase activity"
## [5] "GO:0000010 trans-hexaprenyltranstransferase activity"
## [6] "GO:0000012 single strand break repair"
head(get(ls(go.env)[1], go.env)) # Look at one gene set
## [1] "HLA-DOB" "CLDN6" "YPEL5" "DYNLRB1" "LRRC41" "RSPH3"
From GMT
The GMT file format is a tab delimited file format that describes gene sets. GMT files for Broad's MSigDB and other gene sets can be downloaded from the Broad Website.
## read in Broad gmt format
library(GSA)
filename <- 'https://raw.githubusercontent.com/JEFworks/genesets/master/msigdb.v5.0.symbols.gmt'
gs <- GSA.read.gmt(filename)
## number of gene sets
n <- length(gs$geneset.names)
## create environment
env <- new.env(parent=globalenv())
invisible(lapply(1:n,function(i) {
genes <- as.character(unlist(gs$genesets[i]))
name <- as.character(gs$geneset.names[i])
assign(name, genes, envir = env)
}))
go.env <- env
# Test
class(go.env)
## [1] "environment"
head(ls(go.env)) # Look at gene set names
## [1] "3_5_CYCLIC_NUCLEOTIDE_PHOSPHODIESTERASE_ACTIVITY"
## [2] "3_5_EXONUCLEASE_ACTIVITY"
## [3] "AAACCAC,MIR-140"
## [4] "AAAGACA,MIR-511"
## [5] "AAAGGAT,MIR-501"
## [6] "AAAGGGA,MIR-204,MIR-211"
head(get(ls(go.env)[1], go.env)) # Look at one gene set
## [1] "PDE3B" "PDE4D" "PDE3A" "PDE10A" "PDE4C" "PDE7B"