# plotGODESeq
#
# Copyright (C) 2018 Nicolas Le Novère
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
# This function aims at integrating Gene Ontology overepresentation
# analysis and differential gene expression.
# The initial idea came from the function GOBubble in the package
# GOplot by Wencke Walter
#######################################################################
## Replaces the function from the package SDMTool no longer maintained
##
### ORIGINAL DOCUMENTATION
#' Legend Gradient
#'
#' \code{legend.gradient} creates and displays a gradient legend on a plot or
#' image file. The place and size of the legend is defined by coordinates,
#' previously identified.
#'
#'
#' @param pnts x and y coordinates of the gradient location in the plot
#' @param cols a set of 2 or more colors used in the image, to create the
#' gradient
#' @param limits to label the min and max values of the gradient in the legend
#' @param title to specify the title of the legend
#' @param ... other graphical parameters defined by image() or plot()
#' @return nothing is returned, a gradient legend is added to a plot or a
#' image.
#' @author Lorena Falconi \email{lorefalconi@@gmail.com}
#' @examples
#'
#'
#' #define a simple binary matrix
#' tmat = { matrix(c( 0,0,0,1,0,0,1,1,0,1,
#' 0,0,1,0,1,0,0,0,0,0,
#' 0,1,NA,1,0,1,0,0,0,1,
#' 1,0,1,1,1,0,1,0,0,1,
#' 0,1,0,1,0,1,0,0,0,1,
#' 0,0,1,0,1,0,0,1,1,0,
#' 1,0,0,1,0,0,1,0,0,0,
#' 0,1,0,0,0,1,0,NA,NA,NA,
#' 0,0,1,1,1,0,0,NA,NA,NA,
#' 1,1,1,0,0,0,0,NA,NA,NA),nr=10,byrow=TRUE) }
#'
#' #do the connected component labeling
#' tasc = ConnCompLabel(tmat)
#'
#' # Create a color ramp
#' colormap=c("grey","yellow","yellowgreen","olivedrab1","lightblue4")
#'
#' #create an image
#' image(tasc,col=colormap, axes=FALSE, xlab="", ylab="", ann=FALSE)
#'
#' #points for the gradient legend
#' pnts = cbind(x =c(0.8,0.9,0.9,0.8), y =c(1.0,1.0,0.8,0.8))
#'
#' #create the gradient legend
#' legend.gradient(pnts,colormap,c("Low","High"))
#'
#'
#' @export legend.gradient
#'
##### END ORIGINAL DOCUMENTATION
legend.gradient = function(pnts,cols=heat.colors(100),limits=c(0,1), title='Legend', ...){
pnts = try(as.matrix(pnts),silent=T)
if(!is.matrix(pnts)) stop("you must have a 4x2 matrix")
if(dim(pnts)[1]!=4 || dim (pnts)[2]!=2) stop ("Matrix must have dimensions of 4 rows and 2 columms")
if(length(cols)<2) stop("You must have 2 or more colors")
#break up the min and max into a number of values == length(cols)
yvals = seq(min(pnts[, 2]), max(pnts[, 2]), length=length(cols)+1)
#cycle through each of the yvals and create polygons
for (i in 1:length(cols)){ #create the polygon for that color
polygon(x=pnts[,1],y=c(yvals[i],yvals[i],yvals[i+1],yvals[i+1]),col=cols[i],border=F)
}
#add the text
text(max(pnts[,1]),min(pnts[,2]),labels=limits[1],pos=4,...)
text(max(pnts[,1]),max(pnts[,2]),labels=limits[2],pos=4,...)
text(min(pnts[,1]),max(pnts[,2]),labels=title,adj=c(0,-1),...)
}
plotGODESeq <- function(goenrich,
deseq,
maxFDR,
collapse,
color,
lowCol,
midCol,
highCol,
scale,
label,
maxFDRLab,
minZscoreLab,
extrawidth,
centered,
fixed_ymax,
leghoffset,
legvoffset,
wrap){
###### Input:
###
# label: label of the bubbles. Can take values of "id" (default) or "description"
# char
# NB: Needs to be checked before goenrich, because value is used in checking goenrich
if( missing(label) ){
label = "id"
cat("No label scheme was specified. GO term IDs will be used.\n")
}
if( !( label %in% c("id","description") ) ){
label = "id"
cat("The specified label scheme was neither \"id\" nor \"description\". GO IDs will be used.\n")
}
###
# goenrich: result of over-representation analysis. The format is based on
# dataframe(
# no rownames
# ID : GO term ID (Factor)
# description : GO term description (character. Hopefully without ",")
# Enrich : ratio between observed and expected annotated genes (numeric)
# PValue : significance of the over-representation, from hypergeometric test (numeric) NOT STRICTLY NEEDED AT THE MOMENT
# FDR : False Discovery Rate, a.k.a adjusted p-value, normally obtained using Benjamini-Hochberg correction (numeric)
# genes : list of genes annotated by the GO term, separated with semicolumns (Factor)
# )
# What to do if no ORA input?
if( missing(goenrich) ){
stop("No Gene Ontology enrichment dataset provided.")
}
if( label == "id" & !( "ID" %in% colnames(goenrich) ) ){
stop("Gene Ontology enrichment dataset does not provide term IDs, necessary to label the bubbles.")
}
if( label == "description" & !( "description" %in% colnames(goenrich) ) ){
stop("Gene Ontology enrichment dataset does not provide term descriptions, necessary to label the bubbles.")
}
if( !( "Enrich" %in% colnames(goenrich) ) ){
stop("Gene Ontology enrichment dataset does not provide a column \"Enrich\" with term enrichments, necessary for bubble size.")
}
# THE FOLLOWING CODE IS NOT NEEDED AT THE MOMENT. KEPT FOR FUTURE USE
# if( !( "PValue" %in% colnames(goenrich) ) ){
# stop("Gene Ontology enrichment dataset does not provide non corrected significance of term enrichment, necessary for ???.")
# }
# FDR
if( !( "FDR" %in% colnames(goenrich) ) ){
stop("Gene Ontology enrichment dataset does not provide a column \"FDR\" with False Discovery Rates (a.k.a. adjusted P-values). They are necessary for plot Y axis.")
}
# genes
if( !( "genes" %in% colnames(goenrich) ) ){
stop("Gene Ontology enrichment dataset does not provide a column \"genes\" with the list of genes annotated by each GO term. This is necessary for computing the zscores used as X axis, and the mean differential expression, used to color bubbles.")
}
###
# color: color of the bubbles. Can take values of "zscore" (default) or "l2fc"
# char
# NB: Needs to be checked before deseq, because its value is used when checking deseq
if( missing(color) ){
color = "zscore"
cat("No color scheme was specified. Zscores will be used.\n")
}
if( !( color %in% c("zscore","l2fc") ) ){
color = "zscore"
cat("The specified color scheme was neither \"zscore\" nor \"l2fc\". Zscores will be used.\n")
}
###
# deseq: result of DESeq2 analysis. We only use two type of information
# dataframe(
# rownames: Gene symbols
# log2FoldChange: Extent of the differential expression for each gene
# )
# What to do if no DESeq input?
if( missing(deseq) ){
stop("No differential expression dataset provided.")
}
# L2FC
if( color == "l2fc" & !( "log2FoldChange" %in% colnames(deseq) ) ){
stop("Differential Expression dataset does not provide a column \"log2FoldChange\" necessary to compute the plot color.")
}
###
# maxFDR: maximum value of the ORA False Discovery Rate that we consider for the plot
# numeric
if( missing(maxFDR) ){
maxFDR = 0.05
cat(paste("No maximum FDR provided for plotting. Default of", maxFDR,"will be used.\n"))
}
###
# collapse: either FALSE or value between 0 and 1, representing the proportion of genes in common for two GO terms to be merged.
if( missing(collapse) ){ collapse = FALSE
cat("No maximum GO term collapsing level is provided. No collapse will be performed.\n")
}
###
# lowCol: color of the bubbles for the low "color" values
# valid values for colors, i.e. "blue", "#aabbcc" etc.
if( missing(lowCol) ){ lowCol = "darkcyan"}
###
# midCol: color of the bubbles for the medium "color" values
# valid values for colors, i.e. "blue", "#aabbcc" etc.
if( missing(midCol) ){ midCol = "yellow"}
###
# highCol: color of the bubbles for the high "color" values
# valid values for colors, i.e. "blue", "#aabbcc" etc.
if( missing(highCol) ){ highCol = "darkred"}
###
# scale: scale the radius the bubbles.
# numeric
if( missing(scale) ){ scale = 1 }
###
# maxFDRLab: maximum Padj value at which bubbles are labelled
# numeric
if( missing(maxFDRLab) ){
maxFDRLab = maxFDR
cat(paste("No maximum FDR provided for labelling. Default of",maxFDRLab,"will be used.\n"))
}
###
# minZscoreLab: minimum absolute zscore value at which bubbles are labelled
# numeric
if( missing(minZscoreLab) ){
minZscoreLab = 0
cat(paste("No minimum zscore provided for labelling. Default of",minZscoreLab,"will be used.\n"))
}
###
# extrawidth: width added to the left of min(zscore) and right of max(zscore)
# if the "centered" is set to TRUE, it is added on both side to max(abs(min(zscore)),abs(max(zscore)))
# numeric
if( missing(extrawidth) ){ extrawidth = 1 }
###
# centered: decides if the plot is symmetrical
# Boolean
if ( missing(centered) ){ centered = FALSE }
###
# centered: decides if the plot is symmetrical
# Boolean
if ( missing(fixed_ymax) ){ fixed_ymax = -1 }
###
# value added to the horizontal position of the legend
# numeric
if( missing(leghoffset)){leghoffset = 0 }
###
# value added to the vertical position of the legend
# numeric
if( missing(legvoffset)){ legvoffset = 0 }
###
# wrap: width of label's lines
# numeric
if ( missing(wrap) ){ wrap = 15 }
##############################
# loaging required packages
##############################
library(GOplot) # necessary for the function collapsing GO terms that overlap
library(tidyr) # necessary for the function separate_rows
# library(SDMTools) # necessary for the function legend.gradient
# Not maintained anymore. The function is now included in this script
##############################
## Preparation of the data
##############################
# Replace the p-values and FDR that are zero so that they can be plotted on log-scale.
# # PVALUE IS NOT USED AT THE MOMENT. KEEP FOR FUTURE USE
# minpval <- min(goenrich[goenrich$PValue>0,]$PValue)
# # replace the 0 FDR by a tenth of the minimal ones
# goenrich$PValue[which(goenrich$PValue==0)] <- minpval/10
if (nrow(goenrich) == 0){
stop("No GO term in the enrichment file.")
} else {
print(paste("Nb of GO terms in input file: ",nrow(goenrich)))
}
# Get the minimal non-0 FDR
minfdr <- min(goenrich[goenrich$FDR>0,]$FDR)
# replace the 0 FDR by a tenth of the minimal ones
goenrich$FDR[which(goenrich$FDR==0)] <- minfdr/10
# Prepare a subset of the GO enrichment. NB always use this subset, even if it is 100% of the entire results.
goenrich_subset <- subset(goenrich, goenrich$FDR < maxFDR)
if (nrow(goenrich_subset) == 0){
stop("No remaining GO terms in the enrichment file. Try increasing the value of the maxFDR argument.")
} else {
print(paste("Nb of GO terms after FDR threshold: ",nrow(goenrich_subset)))
}
### Compute zscores, i.e. relative over or underexpression of Genes annotated by each term.
getZscore <- function(goterm) {
# get the list off all genes annotated by a term
genes<-strsplit(as.character(unlist(goterm)),";")
# using the DESeq results, get the number of genes that are upregulated or downregulated
up <- nrow(deseq_data[rownames(deseq_data) %in% unlist(genes)&deseq$log2FoldChange>=0,])
down <- nrow(deseq_data[rownames(deseq_data) %in% unlist(genes)&deseq$log2FoldChange<0,])
# compute the Zscore
zscore <- (up - down)/sqrt(up+down)
}
# Get the Zscore for each GO term
resZscore<-lapply(goenrich_subset$genes,getZscore)
# add zscore to the GO enrichment table
goenrich_subset$zscore <- unlist(resZscore)
### Compute average gene expression enrichment
getMeanL2FC <- function(goterm) {
# get the list off all genes annotated by a term
genes<-unlist(strsplit(as.character(unlist(goterm)),";"))
checkGeneDESeq <- function(gene){
if ( !( gene %in% rownames(deseq_data) ) ){
cat(paste("gene",gene,"is in the GO enrichment table but not in the list of differential expression\n"))
}
}
lapply(genes,checkGeneDESeq)
# compute the mean of L2FC for all the genes annotated by the term
meanL2FC <- mean(deseq_data[rownames(deseq_data) %in% genes,]$log2FoldChange)
}
# Get the mean L2FC for each GO term
resL2FC<-lapply(goenrich_subset$genes,getMeanL2FC)
# add average gene expression enrichment to the GO enrichment table
goenrich_subset$meanL2FC <- unlist(resL2FC)
# rename the FDR into adj_pval and description into term to be compatible with the GOPlot package
colnames(goenrich_subset)[colnames(goenrich_subset) %in% c("description","FDR")] <- c("term","adj_pval")
# explodes each GO term row into as many rows as genes involved
# This is necessary for collapsing terms using the package GOPlot
# separate_rows is a function of the package tidyr
goenrich_expand <- separate_rows(goenrich_subset, genes)
##############################
## Plot the data
##############################
## Use GOPlot package to merge together terms annotating similar genesets
if ( !(collapse == FALSE) ){
enrich_red <- reduce_overlap(goenrich_expand, overlap = collapse)
print(paste("Nb of GO terms after collapsing): ",nrow(enrich_red)))
} else { enrich_red = goenrich_subset } # goenrich_expand is only useful is we collapse.
####### Prepare colours
# build the low palette
rcPallow <- colorRampPalette(c(lowCol,midCol))
# build the high palette
rcPalhigh <- colorRampPalette(c(midCol,highCol))
# FIXME: DEAL WITH THE CASE WHEN ALL THE ZSCORE OR L2FC ARE EITHER POSITIVE OR NEGATIVE
####### Colors based on zscores
if (color == "zscore"){
print(paste("Chosen color for the bubble is: ",color))
# build df to contain all zscores and colors
datcol <- data.frame(zscore = enrich_red$zscore,colour=NA)
# Extract zscores under 0
if ( (nrow(enrich_red[enrich_red$zscore < 0,]) == 0) ){
cat("No negative zscore, only positive color scale is used.\n")
downreg <- 0
} else {
downreg <- enrich_red[enrich_red$zscore < 0,]$zscore
}
# Extract zscores above 0
if ( (nrow(enrich_red[enrich_red$zscore > 0,]) == 0) ){
cat("No strictly positive zscore, only negative color scale is used.\n")
upreg <- 0
} else {
upreg <- enrich_red[enrich_red$zscore >= 0,]$zscore
}
# Compute the number of breaks
maxzscore <- max(abs(downreg),abs(upreg))
if ( !(nrow(enrich_red[enrich_red$zscore < 0,]) == 0) ){
# I want 10 colors max on each side
ndownbreak = ceiling(10* max(abs(downreg))/maxzscore)
# Build negative part of color arrays using the zscore.
datcollow <- rcPallow(ndownbreak)[as.numeric(cut(downreg,breaks = ndownbreak))]
# Populate the colours
datcol[datcol$zscore < 0,]$colour <- datcollow
}
if ( !(nrow(enrich_red[enrich_red$zscore > 0,]) == 0) ){
# I want 10 colors max on each side
nupbreak = ceiling(10* max(abs(upreg))/maxzscore )
# Build positive part of color arrays using the zscore.
datcolhigh <- rcPalhigh(nupbreak)[as.numeric(cut(upreg,breaks = nupbreak))]
# Populate the colours
datcol[datcol$zscore >= 0,]$colour <- datcolhigh
}
}
####### Colors based on L2FC
if (color == "l2fc"){
print(paste("Chosen color for the bubble is: ",color))
# build df to contain all L2FC and colors
datcol <- data.frame(L2FC = enrich_red$meanL2FC,colour=NA)
# Extract L2FC under 0
if ( (nrow(enrich_red[enrich_red$meanL2FC < 0,]) == 0) ){
cat("No negative meanL2FC, only positive color scale is used.\n")
downreg <- 0
} else {
downreg <- enrich_red[enrich_red$meanL2FC < 0,]$meanL2FC
}
# Extract L2FC above 0
if ( (nrow(enrich_red[enrich_red$meanL2FC > 0,]) == 0) ){
cat("No strictly positive L2FC, only negative color scale is used.\n")
upreg <- 0
} else {
upreg <- enrich_red[enrich_red$meanL2FC >= 0,]$meanL2FC
}
# Compute the number of breaks
maxl2fc <- max(abs(downreg),abs(upreg))
if ( !(nrow(enrich_red[enrich_red$meanL2FC < 0,]) == 0) ){
# I want 10 colors max on each side
ndownbreak = ceiling(10* max(abs(downreg))/maxl2fc)
# Build negative part of color arrays using the L2FC
datcollow <- rcPallow(ndownbreak)[as.numeric(cut(downreg,breaks = ndownbreak))]
# Populate the colours
datcol[datcol$L2FC < 0,]$colour <- datcollow
}
if ( !(nrow(enrich_red[enrich_red$meanL2FC > 0,]) == 0) ){
# I want 10 colors max on each side
nupbreak = ceiling(10* max(abs(upreg))/maxl2fc )
# Build positive part of color arrays using the zscore.
datcolhigh <- rcPalhigh(nupbreak)[as.numeric(cut(upreg,breaks = nupbreak))]
# Populate the colours
datcol[datcol$L2FC >= 0,]$colour <- datcolhigh
}
}
####### create width and height of plot
if(centered == TRUE){
xmin = -max(abs(min(enrich_red$zscore)),abs(max(enrich_red$zscore))) - extrawidth
xmax = max(abs(min(enrich_red$zscore)),abs(max(enrich_red$zscore))) + extrawidth
} else {
xmin = min(enrich_red$zscore) - extrawidth
xmax = max(enrich_red$zscore) + extrawidth
}
ymin = min(-log10(enrich_red$adj_pval)) - 0.5
if(fixed_ymax != -1){
cat("Manual ymax entered: ",fixed_ymax,"\n")
ymax = fixed_ymax
} else {
ymax = max(-log10(enrich_red$adj_pval)) + 0.5
}
###### Plot!
# FIXME: PROVIDE AN OPTION TO USE THE RADIUS INSTEAD OF SURFACE FOR THE ENRICHMENT
symbols(enrich_red$zscore,
-log10(enrich_red$adj_pval),
circle = sqrt(enrich_red$Enrich/pi)*scale, # gives the surface in fonction of enrichment.
inches=FALSE,
fg="white",
bg = datcol$colour,
xlim=c(xmin,xmax),ylim=c(ymin,ymax),
xlab="zscore",ylab="-log(adjusted p-value GO enrichment)"
)
###### Create the Legend
# Create a color ramp
rbPal <- colorRampPalette(c(lowCol,midCol,highCol))
legcol <- rbPal(50)
#points for the gradient legend
pnts = cbind(x =c(xmin+0.5+leghoffset,xmin+0.75+leghoffset,xmin+0.75+leghoffset,xmin+0.5+leghoffset),
y =c(ymin+2.5+legvoffset,ymin+2.5+legvoffset,ymin+1+legvoffset,ymin+1+legvoffset))
#create the gradient legend
if (color == "l2fc") {
legend.gradient(pnts,
legcol,
c(sprintf("%.2f",min(enrich_red$meanL2FC)),sprintf("%.2f",max(enrich_red$meanL2FC))),
title = "mean Expr L2FC")
} else {
legend.gradient(pnts,
legcol,
c(sprintf("%.2f",min(enrich_red$zscore)),sprintf("%.2f",max(enrich_red$zscore))),
title = "Zscore")
}
# Legend for enrichment: black circle of surface "1".
# NB: I have to feed a vector of 1 element for the size to work. Don't ask. Symbols is crazy
symbols(c(xmin+0.5+leghoffset+0.125), # move by 0.125 because gradient is 0.25 large
c(ymin+3.5+legvoffset),
circle = c(sqrt(1/pi))*scale,
inches=FALSE,
fg="white",
bg = "black",
add = TRUE
)
text(xmin+0.5+leghoffset+0.125+sqrt(1/pi)*scale,ymin+3.5+legvoffset,label="Enrich = 1",pos=4)
# Select the bubbles that will be labeled.
# We only select adj p-value less than maxFDRLab and zscore inferieur to -minZscoreLab or superieur to minZscoreLab
tolabel <- subset(enrich_red, enrich_red$adj_pval < maxFDRLab | abs(enrich_red$zscore)>minZscoreLab)
if (label == "id"){
####### GO ID as labels
text(enrich_red$zscore,
-log10(enrich_red$adj_pval),
enrich_red$ID,
cex=0.75
)
}
if (label == "description"){
####### GO description as labels
## Build wrapped label
# Core wrapping function
wrap.it <- function(x, len)
{
sapply(x, function(y) paste(strwrap(y, len),
collapse = "\n"),
USE.NAMES = FALSE)
}
# Call this function with a list or vector
wrap.labels <- function(x, len)
{
if (is.list(x))
{
lapply(x, wrap.it, len)
} else {
wrap.it(x, len)
}
}
labels <- wrap.labels(tolabel$term, wrap)
text(tolabel$zscore,
-log10(tolabel$adj_pval),
labels,
cex=0.75
)
}
} ### END OF MAIN FUNCTION