## Install the package
## devtools is only installed when it is missing, so re-running this script
## does not reinstall it every time
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
library(devtools)
## Install BgeeDB from GitHub (vignettes are not built), then load it
install_github("BgeeDB/BgeeDB_R", build_vignettes = FALSE)
library("BgeeDB")

## Load expression calls, and the UBERON ontology ##################
## The loadTopAnatData() function loads a mapping from genes to anatomical structures based on calls of expression in anatomical structures. It also loads the structure of the anatomical ontology and the names of anatomical structures.
## Load all mouse RNA-Seq data for the post-embryonic developmental stage.
## Restricting the query like this makes the computations faster than when using all Bgee data
myTopAnatData <- loadTopAnatData(
  species = 10090,
  datatype = "rna_seq",
  stage = "UBERON:0000092"
)
## To retrieve all mouse data the command would have been
## myTopAnatData <- loadTopAnatData(species=10090)
## This is long, because lots of data! It is quicker if we restrict the query, for example using RNA-seq data only:
## myTopAnatData <- loadTopAnatData(species=10090, datatype="rna_seq")
## or if we restrict to one particular developmental stage:
## myTopAnatData <- loadTopAnatData(species=10090, datatype="rna_seq", stage="UBERON:0000068")
## or if we restrict to high quality expression calls only:
## myTopAnatData <- loadTopAnatData(species=10090, datatype="rna_seq", stage="UBERON:0000068", confidence="high_quality")
## See ?loadTopAnatData for potential options

## Look at the data: print the first two entries of each element of myTopAnatData
lapply(myTopAnatData, function(element) head(element, n = 2))

## Upload and format a list of foreground and background genes ##################
## In this example we will look at mouse genes annotated with "spermatogenesis" in the Gene Ontology (retrieved using the biomaRt package).
## The biocLite() installer script is no longer supported on current R versions;
## Bioconductor packages are installed through the BiocManager package instead.
## Both BiocManager and biomaRt are installed only when they are missing.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (!requireNamespace("biomaRt", quietly = TRUE)) {
  BiocManager::install("biomaRt")
}
library(biomaRt)
## Connect to the Ensembl BioMart and select the mouse gene dataset
ensembl <- useMart("ensembl")
ensembl <- useDataset("mmusculus_gene_ensembl", mart = ensembl)
## Foreground: Ensembl IDs of genes carrying the GO annotation "spermatogenesis" (GO:0007283)
myGenes <- getBM(
  attributes = "ensembl_gene_id",
  filters = "go_id",
  values = list("GO:0007283"),
  mart = ensembl
)
## Background (universe): Ensembl IDs of all genes that have a GO annotation
universe <- getBM(
  attributes = "ensembl_gene_id",
  filters = "with_go_go",
  values = list(TRUE),
  mart = ensembl
)
## If you wanted to use all genes with data in Bgee as universe, you would build the gene list vector directly from the Bgee data, in place of the geneList construction below:
## geneList <- factor(as.integer(names(myTopAnatData$gene2anatomy) %in% myGenes[,1]))
## names(geneList) <- names(myTopAnatData$gene2anatomy)
## Build the gene list vector: a factor named by the background gene IDs,
## with value 1 for genes that are also in the foreground and 0 otherwise
geneList <- setNames(
  factor(as.integer(universe[, 1] %in% myGenes[, 1])),
  universe[, 1]
)
## Quick checks: first entries, and how many genes are flagged as foreground
head(geneList)
summary(geneList == 1)
## Warning: be careful with the choice of background genes, since this can bias the whole analysis. Although it is tempting to consider the whole set of genes in the genome as background, this is usually not a wise choice.

## Prepare a topGO object allowing to perform enrichment test for anatomical terms ##################
## First we need to prepare a list of genes in the foreground and in the background. The input format is the same as the gene list required to build a topGOdata object in the topGO package: a vector with background genes as names, and 0 or 1 values depending if a gene is in the foreground or not.
## With nodeSize=20, an anatomical structure is only tested if at least 20 genes are annotated to it (after propagation)
myTopAnatObject <- topAnat(
  myTopAnatData,
  geneList,
  nodeSize = 20
)
## Warning: This can be long, especially if the gene list is large, since the expression calls are propagated through the whole ontology

## Launch the enrichment test for anatomical terms ##################
## For this step, see the vignette of the topGO package for more details, as we directly use the methods implemented in this package
## Example: run a classical Fisher (hypergeometric) test on the topAnat object
results <- runTest(
  myTopAnatObject,
  algorithm = "classic",
  statistic = "fisher"
)
## You can also use the topGO decorrelation methods to get less redundant results, for example the "weight" method
## results <- runTest(myTopAnatObject, algorithm = 'weight', statistic = 'fisher')
## Again this can be long because of the size of the ontology

## Format the table of results ################## 
## Results are sorted by p-value, and FDR values are calculated.
## Display results significant at a 20% FDR threshold (the 0.2 passed below);
## pass a smaller value, e.g. 0.01, for a stricter 1% threshold
tableOver <- makeTable(myTopAnatData, myTopAnatObject, results, 0.2)
## It can be instructive to sort results by their fold enrichment
tableOver <- tableOver[order(tableOver$foldEnrichment, decreasing = TRUE), ]

## Save the table as a tab-separated text file, with a header and without row names.
## TRUE/FALSE are spelled out because T and F are ordinary variables that can be reassigned.
write.table(
  tableOver,
  file = "results_spermatogenesis.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = TRUE
)
## Warning: it is debated whether FDR correction is appropriate on enrichment test results, since tests on different terms of the ontologies are not independent. A nice discussion can be found in the vignette of the topGO package.

