diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/DEA.metadata.json b/2024/evaiin1/SingleCell/DEA_GSEA/DEA.metadata.json new file mode 100644 index 0000000..5871cfe --- /dev/null +++ b/2024/evaiin1/SingleCell/DEA_GSEA/DEA.metadata.json @@ -0,0 +1,43 @@ +{ + "name": "SingleCell_DEA.Rmd", + "abstract": "Learn how to perform DEA (Differential Expression Analysis) based on a scaled, normalized, and filtered Seurat object", + "author": [ + { + "@type": "Person", + "name": "Thibault Dayris", + "email": "thibault.dayris@gustaveroussy.fr" + } + ], + "contributor": [ + { + "@type": "Person", + "name": "Emilie Drouineau", + "email": "emilie.drouineau@cea.fr" + } + ], + "description": "This tutorial let you see all function IO and contains the full verbatim of the EBAII 2023 session", + "audiance": { + "@type": "Audiance", + "audienceType": "beginner command line biologist" + }, + "educationalLevel": "Beginner", + "inLanguage": [ + "en-GB" + ], + "teaches": [ + "The student will be able to load Seurat object", + "The student will be able to select a DEA method", + "The student will be able to select a Seurat count slot", + "The student will be able to compose the command line themselves", + "The student will be albe to draw basic graphs", + "The student will be able to save results as CSV" + ], + "keywords": "Seurat, R, Differential Expression Analysis", + "learningResourceType": [ + "handout" + ], + "license": [ + "https://mit-license.org/" + ], + "dateModified": "2023-10-25T13:43:27,142911245+02:00" +} \ No newline at end of file diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/GSEA.metadata.json b/2024/evaiin1/SingleCell/DEA_GSEA/GSEA.metadata.json new file mode 100644 index 0000000..4b54f7f --- /dev/null +++ b/2024/evaiin1/SingleCell/DEA_GSEA/GSEA.metadata.json @@ -0,0 +1,43 @@ +{ + "name": "SingleCell_GSEA.Rmd", + "abstract": "Learn how to perform GSEA (Gene Set Enrichment Analysis) based on a scaled, normalized, and filtered Seurat object", + "author": [ + { + "@type": "Person", + "name": "Thibault Dayris", + "email": "thibault.dayris@gustaveroussy.fr" + } + ], + "contributor": [ + { + "@type": "Person", + "name": "Emilie Drouineau", + "email": "emilie.drouineau@cea.fr" + } + ], + "description": "This tutorial let you see all function IO and contains the full verbatim of the EBAII 2023 session", + "audiance": { + "@type": "Audiance", + "audienceType": "beginner command line biologist" + }, + "educationalLevel": "Beginner", + "inLanguage": [ + "en-GB" + ], + "teaches": [ + "The student will be able to load Seurat object", + "The student will be able to select a GSEA method", + "The student will be able to select a Seurat count slot", + "The student will be able to compose the command line themselves", + "The student will be albe to draw basic graphs", + "The student will be able to save results as CSV" + ], + "keywords": "Seurat, R, Gene Set Enrichment Analysis", + "learningResourceType": [ + "handout" + ], + "license": [ + "https://mit-license.org/" + ], + "dateModified": "2023-10-25T13:43:27,142911245+02:00" +} \ No newline at end of file diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/Makefile b/2024/evaiin1/SingleCell/DEA_GSEA/Makefile new file mode 100644 index 0000000..52607dd --- /dev/null +++ b/2024/evaiin1/SingleCell/DEA_GSEA/Makefile @@ -0,0 +1,29 @@ + +SHELL=/usr/bin/bash +.ONESHELL: +.SHELLFLAGS := -euic +.DELETE_ON_ERROR: +MAKEFLAGS += --warn-undefined-variables +MAKEFLAGS += --no-builtin-rules + +.PHONY: +all: SingleCell_GSEA.html SingleCell_DEA.html + +SingleCell_GSEA.html: ebaii SingleCell_DEA.html + mamba activate ./ebaii && \ + R -e 'rmarkdown::render("SingleCell_GSEA.Rmd")' + + +SingleCell_DEA.html: ebaii Scaled_Normalized_Seurat_Object.RDS + mamba activate ./ebaii && \ + R -e 'rmarkdown::render("SingleCell_DEA.Rmd")' + +Scaled_Normalized_Seurat_Object.RDS: + wget https://nextcloud.gustaveroussy.fr/s/p8Ab8Be43xFogYN/download/Scaled_Normalized_Seurat_Object.RDS + +ebaii: + mamba env create -p ./ebaii -f environment.yaml + +.PHONY: +clean: + rm -rf ./ebaii *.RDS diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/SingleCell_DEA.Rmd b/2024/evaiin1/SingleCell/DEA_GSEA/SingleCell_DEA.Rmd new file mode 100644 index 0000000..b633255 --- /dev/null +++ b/2024/evaiin1/SingleCell/DEA_GSEA/SingleCell_DEA.Rmd @@ -0,0 +1,966 @@ +--- +title: "
I need three teams for this session: team Wilcoxon
,
+team Student
,
+and team ROC
.
I will also need your help because I can’t make DESeq2 +work correctly. But I’m sure, that we will solve my issue: you’re in the +best session here at EBAII.
+In this presentation, there will be screen captures for you to follow +the lesson. There will also be every single R command lines. Do not take +care of the command lines if you find them too challenging. Our goal +here, is to understand the main mechanism of Differential Expression +Analysis. R is just a tool.
+Below are the libraries we need to perform this whole session:
+::library(package = "BiocParallel") # Optionally multithread some steps
+ base::library(package = "DT") # Display nice table in HTML
+ base::library(package = "ggplot2") # Draw graphs and plots
+ base::library(package = "ggpubr") # Draw nicer graphs
+ base::library(package = "rstatix") # Base R statistics
+ base::library(package = "knitr") # Build this presentation
+ base::library(package = "dplyr") # Handle big tables
+ base::library(package = "Seurat") # Handle SingleCell analyses
+ base::library(package = "SeuratObject") # Handle SingleCell objects
+ base::library(package = "SingleCellExperiment") # Handle SingleCell file formats
+ base::library(package = "UpSetR") # Nice venn-like graphs
+ base::library(package = "EnhancedVolcano") # Draw Volcano plot base
First, we load Seurat object:
+<- base::readRDS(
+ sobj # Path to the RDS file
+ file = "/shared/projects/ebaii_sc_teachers/SC_TD/06_Integration/RESULTS/12_TD3A.TDCT_S5_Integrated_12926.3886.RDS"
+ )
Then we join layers:
+::Idents(sobj) <- sobj$orig.ident
+ Seurat<- SeuratObject::JoinLayers(sobj) sobj
Then we perform differential expression analysis:
+<- Seurat::FindMarkers(
+ sobj_de # Object on which to perform DEA
+ object = sobj,
+ # Name of factor in condition 1
+ ident.1 = "TD3A",
+ # Name of factor in condition 2
+ ident.2 = "TDCT"
+ )
And that’s all. Our goal is to understand these lines, being able to +write them is a bonus.
+Up to now, we have:
+We would like to identify the list of genes that characterize +differences between cell cycle phases G1 and S groups.
+At the end of this session you will know:
+You already have a dataset loaded ? Good. Keep on going with it ! You +don’t have one ? Use mine:
+# The function `readRDS` from package `base`.
+<- base::readRDS(
+ sobj # Path to the RDS file
+ file = "/shared/projects/ebaii_sc_teachers/SC_TD/06_Integration/RESULTS/12_TD3A.TDCT_S5_Integrated_12926.3886.RDS"
+ )
The command above, uses the function readRDS
+from base
R package. and git it the path to the RDS
+file we built in previous session.
We are wondering what’s in our dataset. Let’s have a look, with the
+function print
+form the package base
.
::print(
+ base# Object to display
+ x = sobj
+ )
An object of class Seurat
+12926 features across 3886 samples within 1 assay
+Active assay: RNA (12926 features, 2000 variable features)
+ 5 layers present: counts.TD3A, counts.TDCT, data.TD3A, data.TDCT, scale.data
+ 6 dimensional reductions calculated: pca, CCAIntegration, umap, RPCAIntegration, HarmonyIntegration, HarmonyStandalone
+We have 12508 features (aka genes), across 2018 samples +(aka cells) in the TD3A condition. We have 12254 features +(aka genes), across 1868 samples (aka cells) in the +TD3A condition.
+Let us have a look at the RNA counts for 10 cells and their
+annotation, with the function head
+from the package utils
.
::head(
+ utils# Object to visualize
+ x = sobj,
+ # Number of lines to display
+ n = 10
+ )
There is no counts, normalized or not. Where are they ?
+In order to explore the content of sobj
, use the
+function str
from the package utils
:
::str(
+ utils# Object to explore
+ object = sobj@assays
+ )
Alternatively, in RStudio, you can click on the object pane and
+explore manually the content of the object. If we explore the slot
+assays
, then we find the counts.
You can access them with:
+::head(
+ utils# Object to visualize
+ x = SeuratObject::GetAssayData(object = sobj, assay = "RNA", layer = "data.TD3A"),
+ # Number of rows to display
+ n = 10
+ )
We have one gene per line, one cell per column, and RNA counts in +each row.
+For the sake of this session, we won’t compare the whole dataset. It +would take up to 15 minutes to complete. During the rest of this +session, we will compare clusters 8 and 10, as annotated with +Harmony.
+We need to re-annotate layers to do so. This is done in two +steps:
+Idents
+in order to be able to call cells by their cluster names.JoinLayers
+to have a single count table with both TD3A and TDCT cells from clusters
+8 and 10 together.::Idents(sobj) <- sobj$HarmonyStandalone_clusters
+ Seurat<- SeuratObject::JoinLayers(sobj) sobj
Let’s check if our cells are joint:
+::print(sobj) base
An object of class Seurat
+12926 features across 3886 samples within 1 assay
+Active assay: RNA (12926 features, 2000 variable features)
+ 3 layers present: data, counts, scale.data
+ 6 dimensional reductions calculated: pca, CCAIntegration, umap, RPCAIntegration, HarmonyIntegration, HarmonyStandalone
+We have one gene per line, one cell per column, and RNA counts in +each row. Are these counts normalized ? Are they scaled ? Are they +filtered ? Are they corrected ?
+These counts are normalized, scaled, filtered. This information is
+available in the seurat object itself, within the slot
+commands
. See an example below:
names(sobj@commands)
[1] "NormalizeData.RNA"
+ [2] "FindVariableFeatures.RNA"
+ [3] "ScaleData.RNA"
+ [4] "RunPCA.RNA"
+ [5] "RunUMAP.RNA.CCAIntegration"
+ [6] "FindNeighbors.RNA.CCAIntegration"
+ [7] "RunUMAP.RNA.RPCAIntegration"
+ [8] "FindNeighbors.RNA.RPCAIntegration"
+ [9] "RunUMAP.RNA.HarmonyIntegration"
+[10] "FindNeighbors.RNA.HarmonyIntegration"
+[11] "RunUMAP.RNA.HarmonyStandalone"
+[12] "FindNeighbors.RNA.HarmonyStandalone"
+[13] "FindClusters"
+However, please be aware that counts in the slot
+count
are raw counts. Normalized counts are in the slot
+data
and scaled data are in the slot
+scaled.data
. And it you do not find that clear, I totally
+agree with you.
Is it normal that we have so many zeros ? And what about surch low +counts, is it because we downsampled the sequenced reads for this +session ?
+The large number of null counts is completely normal. In maths/stats +we talk about matrix sparcity, i.e a table with lots +of zeros. If the data were to be downsampled, we had had done this by +cropping over a small chromosome, and not reducing the general amount of +reads.
+Seurat let
+us use multiple differential analysis methods with its function FindMarkers
.
The main question now is how to choose the right +test: spoilers, there are no option better than another in all +ways.
+From Soneson & Robinson (2018) Nature Methods:
+Here, researchers have designed an artificial dataset where they knew +in advance the list of differentially expressed genes. They have used +all these algorithms and consigned the results.
+ANOVA was not present in this study.
+The question is now to guess whether this gene is differnetially +expressed or not.
+Let’s have a look at the gene named ‘Plac8’,
+involved in positive regulation of cold-induced thermogenesis and
+positive regulation of transcription by RNA polymerase II. In order to
+plot its expression across all cells, we are going to use the function
+VlnPlot
+from Seurat
package. The input object is obviously
+contained in the sobj
variable, since it is our only Seurat
+object. In addition, we are going to select the feature
+Plac8
, and split the graph according to the clusters we
+annotated earlier.
::VlnPlot(
+ Seurat# A subset of the Seurat object
+ # limited to clusters 8 and 10,
+ # or else we will plot all the clusters
+ object = subset(sobj, HarmonyStandalone_clusters %in% c("8", "10")),
+ # The name of the gene of interest (feature = gene)
+ features = "Plac8",
+ # The name of the Seurat cell annotation
+ split.by = "HarmonyStandalone_clusters",
+ # Change color for presentation
+ cols = c("darkslategray3", "olivedrab")
+ )
Using your ‘intuition’, is this gene differentially +expressed between cluster 8 and cluster 10 ?
+In cluster 10, the violin plot highlights almost no cells with low or
+zero Plac8
expression. The highest density of cells has a
+Plac8
normalized expression aroung 1.5.
In Cluster 8, cells seems to have no expression of Plac8
+at all.
IMHO, and you can disagree, the expression of the gene
+Plac8
differs between cluster 8 and cluster 10. This is
+purely intuitive.
Using your ‘intuition’, is this gene differentially +expressed between G1 and S phases ?
+::VlnPlot(
+ Seurat# The Seurat object
+ object = sobj,
+ # The name of the gene of interest (feature = gene)
+ features = "Plac8",
+ # The name of the Seurat cell-cycle annotation
+ group.by = "CC_Seurat_Phase",
+ # Change color for presentation
+ cols = c("darkslategray3", "olivedrab", "orangered3")
+ )
Most of the expression is null, some cells express the gene.
+Okay, let’s have some informations about these distributions.
+# Store counts in a variable
+<- base::as.data.frame(
+ counts # The matrix to reformat into a dataframe
+ x = SeuratObject::GetAssayData(object = sobj, assay = "RNA", layer = "data")
+
+ )# Rename cells with cell harmony cluster
+::colnames(counts) <- paste(
+ base# The names of the cell cluster for each cell
+ $CC_Seurat_Phase,
+ sobj# The names of the cells themselves
+ colnames(sobj),
+ sep = "_"
+ )
We have 2838 cells within the G1 group:
+<- select(counts, matches("^G1."))
+ countsG1
+<- base::as.data.frame(base::t(countsG1["Plac8", ]))
+ Plac8G1
+::summary(Plac8G1) base
Plac8
+ Min. :0.00000
+ 1st Qu.:0.00000
+ Median :0.00000
+ Mean :0.04823
+ 3rd Qu.:0.00000
+ Max. :3.48145
+We have 711 cells withing the S group:
+<- select(counts, matches("^S."))
+ countsS
+<- base::as.data.frame(base::t(countsS["Plac8", ]))
+ Plac8S
+::summary(Plac8S) base
Plac8
+ Min. :0.0000
+ 1st Qu.:0.0000
+ Median :0.0000
+ Mean :0.2097
+ 3rd Qu.:0.0000
+ Max. :3.5292
+Okay, let us resort on statistics to evaluate our chances to be guess +correctly, or our risks to guess wrong.
+We have lots of observations: 2838 cells within the G1 phase, and 711 +cells withing the S phase.Statisticians really like to have a lot of +observations! Ideally, statisticians always want to have more +observation than tests. We have a total of 3549 observations and we are +testing 1 gene. For them, this is a dream come true!
+Are our cells supposed to be interacting with each others ? Are they +independent from each others ? This is very important, and usually, it +requires a discussion.
+oes the expression in our cells follow a normal distribution ? It’s +easy to check. Let’s draw the expression like we did above.
+First, we use the function rbind
+from base
package. Be careful, the function
+rbind
also exists in DelayedArray
,
+data.table
, and BiocGenerics
packages. We want
+to use the basic one. Here is an example of odd behaviors that occurs
+when not using the package name before a function call.
# Add column idientifiers in each count dataframes
+$phase <- "G1"
+ Plac8G1$phase <- "S"
+ Plac8S# Paste the rows beneith each other
+<- base::rbind(
+ Plac8 # variable pointing to G1 counts
+
+ Plac8G1,# variable pointing to S counts
+
+ Plac8S,# A Boolean, requesting that strings/characters
+ # should not be casted as `logical`. It breaks graphs.
+ stringsAsFactors = FALSE
+ )
Secondly, we use the function gghistogram
+from package ggpubr
in order to display relative abundance
+of gene expression:
::gghistogram(
+ ggpubr
+ Plac8,x = "Plac8",
+ y = '..density..',
+ fill = "steelblue",
+ bins = 15,
+ add_density = TRUE
+ )
Our distribution doesn’t seems to be normal, nor binomial we will +have to rely on non-parametric tests.
+Let’s run a non-parametric test base on the mean of distributions, +since it’s the clothest to our ‘intuitive’ approach. Let there be Wilcoxon +test.
+In R, it’s quite straightforward: we have the function wilcoxon_test
+to perform the test, then we can plot the result.
# On the expression table stored in the varialbe `Plac8`,
+# first apply the function `wilcox_test` from package `rstatix`,
+# then we apply the function `add_significance` from package `rstatix`
+<- Plac8 %>% rstatix::wilcox_test(Plac8 ~ phase) %>% rstatix::add_significance()
+ stat.test # While doing so, we usually also compute effect size
+# eff.size <- Plac8 %>% rstatix::wilcox_effsize(Plac8 ~ phase)
Wilcoxon test says: the distributions are different, with a 3^{-13} % +of risks of being wrong. The gene Plac8 can safely be said +differentially expressed. We can compute a fold change and conclude.
+Just out of curiosity, be aware that t_test
+from rstatix
package, gives the same answer. However, DESeq
+gives a p-value of 0.05 and an adjusted p-value equal to 1.
Depending on the test used, this gene is, or is not differentially +expressed.
+In conclusion, to select your method, use the following:
+Please, never use a simple Wilcoxon on bulk RNA-Seq +data.
+There it is quite easier:
+ + +With the function FindMarkers
+from package Seurat
, we want to make three groups:
wilcoxon
to perform DEA between “G1” and “S”
+phases.t
-test to perform DEA between “G1” and “S”
+phases.ROC
to perform DEA between “G1” and “S”
+phases.We will observe the results and compare our gene lists.
+Hey, why are you looking at me? It’s your turn to work! Use the all
+the notions seen above to select the right counts (slot
),
+the right input object, and the right arguments.
10 minutes practice !
+Here are the code for each team:
+<- Seurat::FindMarkers(
+ sobj_wilcox # The variable that contains Seurat Object
+ object = sobj,
+ # Name of condition 1
+ ident.1 = "8",
+ # Name of condition 2
+ ident.2 = "10",
+ # Factor name in the Seurat Object
+ group.by = "HarmonyStandalone_clusters",
+ # Differential analysis method
+ test.use = "wilcox"
+
+ )
+<- Seurat::FindMarkers(
+ sobj_t # The variable that contains Seurat Object
+ object = sobj,
+ # Name of condition 1
+ ident.1 = "8",
+ # Name of condition 2
+ ident.2 = "10",
+ # Factor name in the Seurat Object
+ group.by = "HarmonyStandalone_clusters",
+ # Differential analysis method
+ test.use = "t"
+
+ )
+<- Seurat::FindMarkers(
+ sobj_roc # The variable that contains Seurat Object
+ object = sobj,
+ # Name of condition 1
+ ident.1 = "8",
+ # Name of condition 2
+ ident.2 = "10",
+ # Factor name in the Seurat Object
+ group.by = "HarmonyStandalone_clusters",
+ # Differential analysis method
+ test.use = "roc"
+ )
In the function argument, there is a FoldChange threshold. Should we +filter gene expression based on FoldChange? In case of positive answer, +how much should that threshold be?
+About thresholds on FDR (False Discovery Rate) and Log2(FC) (Log of +the Fold Change), there are many discussions.
+Remember, here the threshold on Fold Change is Logged. A
+log2(1) =
0. And keep in mind the following:
In conclusion, there one, and only one reason to filter on fold +change: in my experience, a fold change below 0.7 will be hard to +see/verify on wet-lab (qRT).
+If you need to reduce a too large number of differentially expressed +genes, then reduce the FDR to 0.01, or even better, to 0.001. With that, +you reduce your number of false claims.
+Can you help me with DEseq2
?
When I run the following command line, I have an error :
+<- Seurat::FindMarkers(
+ sobj_deseq2 # The variable that contains Seurat Object
+ object = sobj,
+ # Name of condition 1
+ ident.1 = 8,
+ # Name of condition 2
+ ident.2 = 10,
+ # Factor name in the Seurat Object
+ group.by = "HarmonyStandalone_clusters",
+ # Differential analysis method
+ test.use = "deseq2",
+ # Use non-normalized data with DESeq2
+ slot = "counts"
+ )
++Error in PerformDE(object = object, cells.1 = cells.1, cells.2 = +cells.2, : Unknown test: deseq2
+
Oh! My fault, it was a typo in my command! Thank you all for your +help!
+<- Seurat::FindMarkers(
+ sobj_deseq2 # The variable that contains Seurat Object
+ object = sobj,
+ # Name of condition 1
+ ident.1 = "8",
+ # Name of condition 2
+ ident.2 = "10",
+ # Factor name in the Seurat Object
+ group.by = "HarmonyStandalone_clusters",
+ # Differential analysis method
+ test.use = "DESeq2",
+ # Use non-normalized data with DESeq2
+ slot = "counts"
+ )
Remark: by doing surch modification, some fold changes have been +modified: remember the gene Atad2 with a mean expression of 0.08 in G1, +and 0.2 in S phases? Mean expressions are now around 1.08 for G1, and +1.2 for S phases. This may be the reason why it was not differentially +expressed in DESeq2, while Wilcoxon and T-test returned adjusted pvalues +far below 0.05.
+Let us have a look at the results:
+ + +We have the following columns:
+p_val
: Ignore this column. Always ignore raw p-values.
+Look at corrected ones, and if they are missing, then compute them.avg_log2FC
: Average Log2(FoldChange). Illustrates how
+much a gene is differentially expessed between samples in each
+condition.pct.1
: Percent of cells with gene expression in
+condition one, here in “G1” phase.pct.2
: Percent of cells with gene expression in
+condition two, here in “S” phase.p_val_adj
: The confidence we have in the result. The
+closer to 0, the lesser is the risk of error.We have the following columns:
+p_val
: Ignore this column. Always ignore raw p-values.
+Look at corrected ones, and if they are missing, then compute them.avg_log2FC
: Average Log2(FoldChange). Illustrates how
+much a gene is differentially expessed between samples in each
+condition.pct.1
: Percent of cells with gene expression in
+condition one, here in “G1” phase.pct.2
: Percent of cells with gene expression in
+condition two, here in “S” phase.p_val_adj
: The confidence we have in the result. The
+closer to 0, the lesser is the risk of error.We have the following columns:
+myAUC
: The area under the curveavg_diff
: Average difference in AUCpower
: abs(AUC-0.5) * 2
, usefull to sort
+genes based on AUCpct.1
: Percent of cells with gene expression in
+condition one, here in “G1” phase.pct.2
: Percent of cells with gene expression in
+condition two, here in “S” phase.We have the following columns:
+p_val
: Ignore this column. Always ignore raw p-values.
+Look at corrected ones, and if they are missing, then compute them.avg_log2FC
: Average Log2(FoldChange). Illustrates how
+much a gene is differentially expessed between samples in each
+condition.pct.1
: Percent of cells with gene expression in
+condition one, here in “G1” phase.pct.2
: Percent of cells with gene expression in
+condition two, here in “S” phase.p_val_adj
: The confidence we have in the result. The
+closer to 0, the lesser is the risk of error.What kind of threshold should be used to filter each results?
+++If we must label certain scores as good or bad, we can reference the +following rule of thumb:
+0.5 = No discrimination 0.5-0.7 = Poor discrimination 0.7-0.8 = +Acceptable discrimination 0.8-0.9= Excellent discrimination 0.9 = +Outstanding discrimination
+
Hosmer and Lemeshow in Applied Logistic Regression (p. 177)
+We’d like to store the results of differential expression analysis in
+the Seurat
object.
@misc$wilcox <- sobj_wilcox sobj
Now we can plot intersections in an up-set graph. It is like a venn +diagramm:
+::upset(
+ UpSetRdata = UpSetR::fromList(data),
+ order.by = "freq"
+ )
We’d like to display the expression of genes identified by
+FindMarkers. Then we use the function DoHeatmap
+from the package Seurat
.
In order to limit the graph to differentially expressed reads, we use
+the function rownames
+from R base
package on the DEA result table. In this
+example, I use the results of wilcoxon, but you shall use any of the
+results you previously obtained.
::DoHeatmap(
+ Seurat# variable pointing to seurat object
+ object = sobj,
+ # name of DE genes
+ features = base::rownames(sobj_wilcox),
+ # Cluster annotation
+ group.by = "HarmonyStandalone_clusters",
+ )
A Volcano plot is usefull to identify differnetial expression +analysis bias.
+The package EnhancedVolcano
has an eponym
+function for that:
::EnhancedVolcano(
+ EnhancedVolcano# variable pointing to the DEA results
+ toptable = sobj_wilcox,
+ # Gene names
+ lab = rownames(sobj_wilcox),
+ # Column in which to find Fold Change
+ x = "avg_log2FC",
+ # Column in which to find confidence interval
+ y = "p_val_adj",
+ # Lower fold-change cut-off
+ FCcutoff = 0.2
+ )
This list of all packages used while you work should be included in +each en every R presentation:
+::sessionInfo() utils
R version 4.4.1 (2024-06-14)
+Platform: x86_64-conda-linux-gnu
+Running under: Ubuntu 20.04.6 LTS
+
+Matrix products: default
+BLAS/LAPACK: /shared/ifbstor1/software/miniconda/envs/r-4.4.1/lib/libopenblasp-r0.3.27.so; LAPACK version 3.12.0
+
+locale:
+ [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
+ [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
+ [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
+ [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
+ [9] LC_ADDRESS=C LC_TELEPHONE=C
+[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
+
+time zone: Europe/Paris
+tzcode source: system (glibc)
+
+attached base packages:
+[1] stats4 stats graphics grDevices utils datasets methods
+[8] base
+
+other attached packages:
+ [1] EnhancedVolcano_1.22.0 ggrepel_0.9.6
+ [3] UpSetR_1.4.0 SingleCellExperiment_1.26.0
+ [5] SummarizedExperiment_1.34.0 Biobase_2.64.0
+ [7] GenomicRanges_1.56.2 GenomeInfoDb_1.40.1
+ [9] IRanges_2.38.1 S4Vectors_0.42.1
+[11] BiocGenerics_0.50.0 MatrixGenerics_1.16.0
+[13] matrixStats_1.4.1 Seurat_5.1.0
+[15] SeuratObject_5.0.2 sp_2.1-4
+[17] dplyr_1.1.4 knitr_1.48
+[19] rstatix_0.7.2 ggpubr_0.6.0
+[21] ggplot2_3.5.1 DT_0.33
+[23] BiocParallel_1.38.0
+
+loaded via a namespace (and not attached):
+ [1] RcppAnnoy_0.0.22 splines_4.4.1 later_1.3.2
+ [4] tibble_3.2.1 polyclip_1.10-7 rpart_4.1.23
+ [7] fastDummies_1.7.4 lifecycle_1.0.4 globals_0.16.3
+ [10] lattice_0.22-6 MASS_7.3-61 crosstalk_1.2.1
+ [13] backports_1.5.0 magrittr_2.0.3 limma_3.60.6
+ [16] Hmisc_5.1-3 plotly_4.10.4 sass_0.4.9
+ [19] rmarkdown_2.28 jquerylib_0.1.4 yaml_2.3.10
+ [22] httpuv_1.6.15 sctransform_0.4.1 spam_2.11-0
+ [25] spatstat.sparse_3.1-0 reticulate_1.39.0 cowplot_1.1.3
+ [28] pbapply_1.7-2 RColorBrewer_1.1-3 abind_1.4-8
+ [31] zlibbioc_1.50.0 Rtsne_0.17 purrr_1.0.2
+ [34] nnet_7.3-19 GenomeInfoDbData_1.2.12 irlba_2.3.5.1
+ [37] listenv_0.9.1 spatstat.utils_3.1-0 goftest_1.2-3
+ [40] RSpectra_0.16-2 spatstat.random_3.3-2 fitdistrplus_1.2-1
+ [43] parallelly_1.38.0 DelayedArray_0.30.1 leiden_0.4.3.1
+ [46] codetools_0.2-20 tidyselect_1.2.1 UCSC.utils_1.0.0
+ [49] farver_2.1.2 base64enc_0.1-3 spatstat.explore_3.3-2
+ [52] jsonlite_1.8.9 progressr_0.14.0 Formula_1.2-5
+ [55] ggridges_0.5.6 survival_3.7-0 tools_4.4.1
+ [58] ica_1.0-3 Rcpp_1.0.13 glue_1.8.0
+ [61] SparseArray_1.4.8 gridExtra_2.3 DESeq2_1.44.0
+ [64] xfun_0.48 withr_3.0.1 fastmap_1.2.0
+ [67] fansi_1.0.6 digest_0.6.37 R6_2.5.1
+ [70] mime_0.12 colorspace_2.1-1 scattermore_1.2
+ [73] tensor_1.5 spatstat.data_3.1-2 utf8_1.2.4
+ [76] tidyr_1.3.1 generics_0.1.3 data.table_1.16.2
+ [79] S4Arrays_1.4.1 httr_1.4.7 htmlwidgets_1.6.4
+ [82] uwot_0.2.2 pkgconfig_2.0.3 gtable_0.3.5
+ [85] lmtest_0.9-40 XVector_0.44.0 htmltools_0.5.8.1
+ [88] carData_3.0-5 dotCall64_1.2 scales_1.3.0
+ [91] png_0.1-8 spatstat.univar_3.0-1 rstudioapi_0.17.0
+ [94] reshape2_1.4.4 checkmate_2.3.1 nlme_3.1-165
+ [97] cachem_1.1.0 zoo_1.8-12 stringr_1.5.1
+[100] KernSmooth_2.23-24 vipor_0.4.7 parallel_4.4.1
+[103] miniUI_0.1.1.1 foreign_0.8-86 ggrastr_1.0.2
+[106] pillar_1.9.0 grid_4.4.1 vctrs_0.6.5
+[109] RANN_2.6.2 promises_1.3.0 car_3.1-3
+[112] xtable_1.8-4 cluster_2.1.6 beeswarm_0.4.0
+[115] htmlTable_2.4.2 evaluate_1.0.1 locfit_1.5-9.9
+[118] cli_3.6.3 compiler_4.4.1 crayon_1.5.3
+[121] rlang_1.1.4 future.apply_1.11.2 ggsignif_0.6.4
+[124] labeling_0.4.3 ggbeeswarm_0.7.2 plyr_1.8.9
+[127] stringi_1.8.4 viridisLite_0.4.2 deldir_2.0-4
+[130] munsell_0.5.1 lazyeval_0.2.2 spatstat.geom_3.3-3
+[133] Matrix_1.7-1 RcppHNSW_0.6.0 patchwork_1.3.0
+[136] future_1.34.0 statmod_1.5.0 shiny_1.9.1
+[139] highr_0.11 ROCR_1.0-11 igraph_2.1.1
+[142] broom_1.0.7 bslib_0.8.0
+In this presentation, there will be screen captures for you to follow +the lesson. There will also be every single R command lines. Do not take +care of the command lines if you find them too challenging. Our goal +here, is to understand the main mechanism of Differential Expression +Analysis. R is just a tool.
+Below are the libraries we need to perform this whole session:
+::library(package = "BiocParallel") # Optionally multithread some steps
+ base::library(package = "DT") # Display nice table in HTML
+ base::library(package = "ggplot2") # Draw graphs and plots
+ base::library(package = "ggpubr") # Draw nicer graphs
+ base::library(package = "rstatix") # Base R statistics
+ base::library(package = "knitr") # Build this presentation
+ base::library(package = "dplyr") # Handle big tables
+ base::library(package = "Seurat") # Handle SingleCell analyses
+ base::library(package = "SeuratObject") # Handle SingleCell objects
+ base::library(package = "SingleCellExperiment") # Handle SingleCell file formats
+ base::library(package = "escape") # Perform exploratory enrichment analysis
+ base::library(package = "clusterProfiler") # Perform GSEA analysis
+ base::library(package = "dittoSeq") # Draw nice plots based on Seurat
+ base::library(package = "org.Mm.eg.db") # Mouse genome annotation
+ base::library(package = "Cairo") # Graphs library
+ base::library(package = "pathview") # For the whole pathway graph base
First, we load Seurat object:
+<- base::readRDS(
+ sobj # Path to the RDS file
+ file = "DEA_Scaled_Normalized_Filtered.RDS"
+ )
Then we launch enrichment exploration on all counts:
+# Acquire gene sets
+<- escape::getGeneSets(
+ mh_hallmark species = "Mus musculus",
+ library = "H"
+
+ )
+# Run enrichment
+<- escape::escape.matrix(
+ enrichment_matrix input.data = sobj,
+ gene.sets = mh_hallmark,
+ method = "ssGSEA",
+ groups = 1000,
+ min.size = 5,
+ BPPARAM = BiocParallel::SnowParam(workers = 2),
+
+ )
+# Save enrichment in seurat object
+"escape"]] <- SeuratObject::CreateAssay5Object(
+ sobj[[data=t(enrichment_matrix),
+
+ )
+saveRDS(sobj, file = "sobj_GSEA.RDS")
Up to now, we have:
+We would like to identify the functions of genes among several +clusters, or differentially expressed genes.
+At the end of this session you will know:
+Let’s search information about this gene on the web. For mice, one of +the best web-site for human is: MGI.
+If we search for the gene Plac8, we +find the following:
+As we click on “Phenotype reference”, we can see that the MGI +database can give us PubMeb +IDs related to the gene Plca8. Below, on the same page, we can see +regulation pathways, protein ontology, gene interactions, etc.
+We illustrate here, that GSEA does not only include gene-to-function, +or gene-to-pathway. They usually include many more information.
+There is a database for pretty much everything you might want. From +pharmacology, to regulatory objects, etc.
+Our dataset contains gene expression. But the same methods can be +applied to other kind of datasets. E.g.
+Protein-protein interactions, drug interactions, SNP-interactions, +etc.
+Within R, you may find genome databases for a wide range of +organisms:
+For this session, we are going to use MSigBD.
+There are two main types of Gene Set Enrichment Analysis: 1. A blind +search among all genes in order to identify pathways or mechanisms among +cells. 1. A specific search through genes of interest
+The first is called “Exploratory” analysis and starts on the
+Seurat
object.
Many packages perform this kind of exploratory analysis, I chose to
+present escape
+for its simple and intuitive usage.
What really matters is the GSEA method used in back-end. There are
+several very well known methods: fgsea
,
+gsva
,
+DOSE
,
+and the MSigDB
’s
+tool. While choosing your favorite R package to perform GSEA, if that
+package uses one of these methods, then you should not have odd
+questions through your publication process.
These methods are very alike, based on the same statistical resorts, +and suffer for the same limitations. Let’s illustrate it with +examples.
+Up to now, we already define what’s a gene set: a group of
+genes of interest. We also know these king of relationships are stored
+in GMT
+files or TSV
+files.
Since the beginning of our week, we’ve been working with R, and the +question naturally comes: how to make/use gene sets in R?
+Very easily. We need a “named list
+of vectors”,
+all coming from base
R package.
# We use the function list from base package
+<- base::list(
+ pathways # We use the function `c` from base package
+ proliferation = base::c(
+ "Ifng", "Kras", "Mgat5",
+ "Prf1", "Trem1", "Trp53"
+
+ ),growth = base::c(
+ "Cdkn2a", "Fos", "Ifnb1",
+ "Rras", "Apc", "Sell"
+
+ )
+ )::head(pathways) utils
$proliferation
+[1] "Ifng" "Kras" "Mgat5" "Prf1" "Trem1" "Trp53"
+
+$growth
+[1] "Cdkn2a" "Fos" "Ifnb1" "Rras" "Apc" "Sell"
+This method is very useful when you want to compare your cells toward +unpublished data, gene sets from an article not sourced in official +databases, etc.
+We can also load a GMT file from a file. This is very usefull when we +expect both known and new pathways to be tested, or known pathways that +should be updated.
+This is done with the function read.gmt
+from the clusterProfiler
+package.
<- clusterProfiler::read.gmt(
+ pathways gmtfile = "m5.mpt.v2023.1.Mm.symbols.gmt"
+
+ )<- BiocGenerics::lapply(
+ pathways ::split(
+ S4Vectors-1], pathways$term
+ pathways[
+ ),function(x) x[x != ""]
+
+ )::head(pathways) utils
$MP_ABNORMAL_TUMOR_VASCULARIZATION
+ [1] "Adamts12" "Aggf1" "Amotl1" "Anxa1" "Arhgef4" "Cav2"
+ [7] "Ccm2l" "Cd82" "Dock4" "Fkbpl" "Gpr4" "H2ax"
+[13] "Idh2" "Ifngr1" "Il12rb2" "Il1a" "Il1b" "Itga6"
+[19] "Itgb3" "Jcad" "Mmrn2" "Myct1" "Ntn4" "Peak1"
+[25] "Pld1" "Rhoj" "Rras" "S1pr2" "Stard13"
+
+$MP_DECREASED_CIRCULATING_TUMOR_NECROSIS_FACTOR_LEVEL
+ [1] "Adam17" "Apba3" "Arid5a" "Bbs12" "C3" "Cd14"
+ [7] "Cd19" "Cd44" "Clic4" "Enpp2" "F3" "Ffar2"
+[13] "Foxn1" "Foxo1" "Gba" "Hectd3" "Ifi204" "Ifi35"
+[19] "Ifit2" "Ifngr1" "Ikbkb" "Il22" "Il6ra" "Irak2"
+[25] "Irak4" "Irf5" "Lacc1" "Lep" "Lrrc19" "Lrrc8e"
+[31] "Mapkapk2" "Mbl1" "Mcpt1" "Mif" "Msmp" "Mstn"
+[37] "Myd88" "Nfkbib" "Nkg7" "Nmi" "Npy1r" "Parp1"
+[43] "Peli1" "Pilrb1" "Plekhf1" "Prkce" "Rhbdf2" "Sgms1"
+[49] "Siglec1" "Snx8" "Sra1" "Thbd" "Ticam1" "Tlr2"
+[55] "Tnf" "Tradd" "Zdhhc1"
+
+$MP_DECREASED_INCIDENCE_OF_INDUCED_TUMORS
+ [1] "Ccng1" "Cd151" "Cdk1" "Cebpb" "Ct55" "Cyp2c23" "Eif2s2"
+ [8] "H2-Ob" "Il6" "Klk6" "Krt10" "Lrig2" "Pgr" "Terc"
+[15] "Uqcc3" "Zmiz1" "a"
+
+$MP_DECREASED_INCIDENCE_OF_TUMORS_BY_CHEMICAL_INDUCTION
+ [1] "Agtr2" "Ahrr" "Ar" "Arl6ip5" "Birc5" "Brca2"
+ [7] "Cacfd1" "Camk2g" "Ccl2" "Cd151" "Cd34" "Cd96"
+[13] "Cdkn1b" "Cdkn2a" "Cebpb" "Chd1l" "Cyp1a2" "Cyp1b1"
+[19] "Cyp2e1" "Dek" "Dnajc27" "Endov" "Ephx1" "Fgf22"
+[25] "Fntb" "Foxm1" "Foxn1" "Fxyd5" "Fzr1" "Gata6os"
+[31] "Hcst" "Hipk1" "Hras" "Ier3" "Il12b" "Il1r1"
+[37] "Il23a" "Il6" "Il6ra" "Itgb1" "Kdm4c" "Klk6"
+[43] "Lonp1" "Loxl2" "Lypd3" "Mapkapk2" "Mgat3" "Mif"
+[49] "Mir21a" "Mir301" "Mki67" "Mmp11" "Mmp19" "Mmp1a"
+[55] "Mpg" "Mtdh" "Muc4" "Myd88" "Neu3" "Pard3"
+[61] "Pik3ca" "Pla2g4a" "Plce1" "Prdx1" "Pten" "Ptger1"
+[67] "Ptger2" "Ptger4" "Ptgs2" "Ptk2" "Ptp4a3" "Ptprj"
+[73] "Pvr" "Pycard" "Ralgds" "Retnlb" "Skil" "Slc3a2"
+[79] "Spp1" "Stag1" "Stat3" "Strap" "Terc" "Tert"
+[85] "Tiam1" "Tlr4" "Tnf" "Tnfaip8l3" "Tnik" "Trem1"
+[91] "Trim27" "Trp53" "Uri1"
+
+$MP_DECREASED_LIVER_TUMOR_INCIDENCE
+[1] "Foxm1" "Mtdh" "Myd88" "Tert" "Tlr2"
+
+$MP_DECREASED_LUNG_TUMOR_INCIDENCE
+[1] "Ggta1" "Kras" "Lexm" "Met" "Mtdh" "Tlr2"
+We can also get pathways from the web using the function getGeneSets
+from package escape
.
# Pay attention to the genome name, it's case sensitive
+# and follows MSigDB naming scheme
+<- escape::getGeneSets(
+ mh_hallmark species = "Mus musculus",
+ library = "H"
+
+ )::head(pathways) utils
$MP_ABNORMAL_TUMOR_VASCULARIZATION
+ [1] "Adamts12" "Aggf1" "Amotl1" "Anxa1" "Arhgef4" "Cav2"
+ [7] "Ccm2l" "Cd82" "Dock4" "Fkbpl" "Gpr4" "H2ax"
+[13] "Idh2" "Ifngr1" "Il12rb2" "Il1a" "Il1b" "Itga6"
+[19] "Itgb3" "Jcad" "Mmrn2" "Myct1" "Ntn4" "Peak1"
+[25] "Pld1" "Rhoj" "Rras" "S1pr2" "Stard13"
+
+$MP_DECREASED_CIRCULATING_TUMOR_NECROSIS_FACTOR_LEVEL
+ [1] "Adam17" "Apba3" "Arid5a" "Bbs12" "C3" "Cd14"
+ [7] "Cd19" "Cd44" "Clic4" "Enpp2" "F3" "Ffar2"
+[13] "Foxn1" "Foxo1" "Gba" "Hectd3" "Ifi204" "Ifi35"
+[19] "Ifit2" "Ifngr1" "Ikbkb" "Il22" "Il6ra" "Irak2"
+[25] "Irak4" "Irf5" "Lacc1" "Lep" "Lrrc19" "Lrrc8e"
+[31] "Mapkapk2" "Mbl1" "Mcpt1" "Mif" "Msmp" "Mstn"
+[37] "Myd88" "Nfkbib" "Nkg7" "Nmi" "Npy1r" "Parp1"
+[43] "Peli1" "Pilrb1" "Plekhf1" "Prkce" "Rhbdf2" "Sgms1"
+[49] "Siglec1" "Snx8" "Sra1" "Thbd" "Ticam1" "Tlr2"
+[55] "Tnf" "Tradd" "Zdhhc1"
+
+$MP_DECREASED_INCIDENCE_OF_INDUCED_TUMORS
+ [1] "Ccng1" "Cd151" "Cdk1" "Cebpb" "Ct55" "Cyp2c23" "Eif2s2"
+ [8] "H2-Ob" "Il6" "Klk6" "Krt10" "Lrig2" "Pgr" "Terc"
+[15] "Uqcc3" "Zmiz1" "a"
+
+$MP_DECREASED_INCIDENCE_OF_TUMORS_BY_CHEMICAL_INDUCTION
+ [1] "Agtr2" "Ahrr" "Ar" "Arl6ip5" "Birc5" "Brca2"
+ [7] "Cacfd1" "Camk2g" "Ccl2" "Cd151" "Cd34" "Cd96"
+[13] "Cdkn1b" "Cdkn2a" "Cebpb" "Chd1l" "Cyp1a2" "Cyp1b1"
+[19] "Cyp2e1" "Dek" "Dnajc27" "Endov" "Ephx1" "Fgf22"
+[25] "Fntb" "Foxm1" "Foxn1" "Fxyd5" "Fzr1" "Gata6os"
+[31] "Hcst" "Hipk1" "Hras" "Ier3" "Il12b" "Il1r1"
+[37] "Il23a" "Il6" "Il6ra" "Itgb1" "Kdm4c" "Klk6"
+[43] "Lonp1" "Loxl2" "Lypd3" "Mapkapk2" "Mgat3" "Mif"
+[49] "Mir21a" "Mir301" "Mki67" "Mmp11" "Mmp19" "Mmp1a"
+[55] "Mpg" "Mtdh" "Muc4" "Myd88" "Neu3" "Pard3"
+[61] "Pik3ca" "Pla2g4a" "Plce1" "Prdx1" "Pten" "Ptger1"
+[67] "Ptger2" "Ptger4" "Ptgs2" "Ptk2" "Ptp4a3" "Ptprj"
+[73] "Pvr" "Pycard" "Ralgds" "Retnlb" "Skil" "Slc3a2"
+[79] "Spp1" "Stag1" "Stat3" "Strap" "Terc" "Tert"
+[85] "Tiam1" "Tlr4" "Tnf" "Tnfaip8l3" "Tnik" "Trem1"
+[91] "Trim27" "Trp53" "Uri1"
+
+$MP_DECREASED_LIVER_TUMOR_INCIDENCE
+[1] "Foxm1" "Mtdh" "Myd88" "Tert" "Tlr2"
+
+$MP_DECREASED_LUNG_TUMOR_INCIDENCE
+[1] "Ggta1" "Kras" "Lexm" "Met" "Mtdh" "Tlr2"
+Let’s time to practice. Use the function escape.matrix
+from the package escape
in order to
+perform GSEA on your Seurat
object. Perform
+the analysis on cell cycle phase if possible. Save the results in a
+variable, for example sobj_enrich
.
It is not possible to specify what group we are considering. +GSEA/Enrichment analysis do not perform any differential analysis.
+# Save in a variable called `sobj_enrich`
+# the result of the function `runEscape` from package `escape`
+<- escape::escape.matrix(
+ enrichment_matrix # Out Seurat object
+ input.data = sobj,
+ # Provide gene sets
+ gene.sets = mh_hallmark,
+ # Select your favorite method
+ method = "ssGSEA",
+ # ssGSEA parameters
+ groups = 1000,
+ min.size = 5,
+ # Use 2 threads/CPU/workers
+ BPPARAM = BiocParallel::SnowParam(workers = 2),
+ )
[1] "Using sets of 1000 cells. Running 4 times."
+[1] "Calculating ranks..."
+[1] "Calculating absolute values from ranks..."
+[1] "Calculating ranks..."
+[1] "Calculating absolute values from ranks..."
+[1] "Calculating ranks..."
+[1] "Calculating absolute values from ranks..."
+[1] "Calculating ranks..."
+[1] "Calculating absolute values from ranks..."
+If you have reserved more than one core, this function can be +multi-threaded, making it faster !
+We can visualise the gene sets enriched in cell clusters using the
+function FeaturePlot
+from Seurat
package. We just need to color the cells with
+ssGSEA
’s results:
::FeaturePlot(object = sobj, features = "HALLMARK-DNA-REPAIR") Seurat
In order to understand what happened, where do these results come +from, we need to go step by step and perform both enrichment and GSE +analysis manually.
+In general, the exploratory method is never used on a raw Seurat +object, we usually filter this object in order to search pathways that +are related to our biological question.
+With the previous method, what ever our biological question was, the +cell cycle phase was one of the top gene expression drivers.
+Let’s search information about the gene ARP2
.
So, this gene exists in multiple organisms. Thanks, we know we are +working on mice, and we told gene set retriever function to consider +‘mouse’ datasets.
+So in the Human genome, ARP2 refers to multiple genes through the
+alias names. When we searched for gene sets with [escape
],
+we did not perform any disambiguation.
The same with arabidopsis thaliana, ARP2 can refer to actin +related protein, or a root development protein.
+In mus musculus, ARP2 refers to an actin related protein on +chromosome 11, or a subunit adaptor protein on chromosome 5.
+We now understand that escape
analysis may be completely
+wrong since it used human-intelligible gene names. These names include
+synonyms, homonyms, within a single organism, or between multiple
+ones.
We need to use unique identifiers. These identifiers are called gene
+identifiers and we usually consider EnsemblID
+or EntrezID
.
For example:
+ENSMUSG00000020152
+at Ensembl.ENSMUSG00000019518
+at Ensembl.These identifiers are very unpleasant to use for human beings. On
+monday meeting, please talk about ARP2
and not
+ENSMUSG00000020152
. However, when machines are working,
+please given them EnsemblID
or EntrezID
.
Let’s translate all gene identifiers and evaluate possible number of +errors in escape analysis.
+First, let’s load the DEA results based on Wilcoxon analysis:
+# We load our differential expression analysis result
+# using the function `readRDS` from `base` package
+<- base::readRDS(file = "sobj_wilcox.RDS")
+ sobj_wilcox # Copy gene names in a column it makes the future operation easier
+$SYMBOL <- base::rownames(sobj_wilcox) sobj_wilcox
Then, we use the function bitr
+(standing for biological translator) from the package clusterProfiler
.
# We store the result of `bitr` function
+# from `clusterProfiler` package
+# in a variable called `annotation`
+<- clusterProfiler::bitr(
+ annotation # We provide the variable pointing to gene names
+ geneID = base::rownames(sobj_wilcox),
+ # We tell the translator to which gene identifiers
+ # translation must be done
+ toType = base::c("ENTREZID", "ENSEMBL"),
+ # We tell the translator which type of gene name we have
+ fromType = "SYMBOL",
+ # We provide mmu database
+ OrgDb = org.Mm.eg.db
+ )
Now we would like to have these annotation alongside with adjusted
+pvalues and fold change information. In order to do so, we use the
+function merge
from base
package. Not from Seurat
+package ! This is used to merger Seurat objects, but we have dataframes
+here!
# We filter these results on **ADJUSTED** pvalue
+<- sobj_wilcox[sobj_wilcox$p_val_adj <= 0.05, ]
+ sobj_wilcox # Use the function `merge` from package `base` in order
+# to add annotation to wixocon DEA result.
+<- base::merge(
+ sobj_wilcox # Variable pointing to the wilcoxon results
+ x = sobj_wilcox,
+ # Variable pointing to the annotation results
+ y = annotation,
+ # We tell how to merge sobj_wilcox
+ by.x = "SYMBOL",
+ # We tell how to merge annotation
+ by.y = "SYMBOL"
+ )
As we perform the analysis, we are going to provide a numeric +variable to the GSEA/Enrichment.
+We have the following columns:
+p_val
: Ignore this column. Always ignore raw p-values.
+Look at corrected ones, and if they are missing, then compute them.avg_log2FC
: Average Log2(FoldChange). Illustrates how
+much a gene is differentially expessed between samples in each
+condition.pct.1
: Percent of cells with gene expression in
+condition one, here in “G1” phase.pct.2
: Percent of cells with gene expression in
+condition two, here in “S” phase.p_val_adj
: The confidence we have in the result. The
+closer to 0, the lesser is the risk of error.Is it a good idea to use p_val
? What are the
+consequences ?
No. Never. Never ever use raw P-Value. It is never a good idea.
+Is it a good idea to use avg_log2FC
? What are the
+consequences ?
It is a good idea, we could answer biological questions like : +“Considering differentially expressed genes, what are the pathways with +highest expression change ?”
+Is it a good idea to use pct.1
? What are the
+consequences ?
It is a good idea, we could answer biological questions like : +“Considering differentially expressed genes, what are the expression of +genes in pathway XXX in the first condition ?”
+Is it a good idea to use pct.2
? What are the
+consequences ?
It is a good idea, we could answer biological questions like : +“Considering differentially expressed genes, what are the expression of +genes in pathway XXX in the second condition ?”
+Is it a good idea to use p_val_adj
? What are the
+consequences ?
It is a good idea, we could answer biological questions like : “Which +pathways are differentially expressed with highest confidence interval +?”
+But in order to perform surch test, use
+-log10(Adjusted P-Value)
instead of the raw adjusted
+p-value. Remember, 0 is a good confidence interval, and 1 a bad one. So
+we need the values close to 0 to be highly positive.
Previously, we used a function called enrichIt
. We
+talked about enrichment analysis, yet this talk is called GSEA.
This is due to the fact that both techniques exists and are +different.
+Enrichment tests a list of genes. Their expression, confidence +interval, etc. Nothing else matters than their name.
+For each cell, enrichIt
removes zeros and tests the list
+of remaining genes. Their expression does not matter. Their order in the
+gene list does not matter. Their impact on the pathway does not matter.
+Their differential expression status does not matter.
Behind the scenes, it’s a very simple tests answering the following +question: “Among expressed genes, what are the odds that it belongs to +the pathway XXX?”
+We can do enrichment tests on our wilxoc results, using the function
+enrichGO
+from the package clusterProfiler
:
# We store the result of the function `enrichGO` from package `clusterProfiler`
+# in the function `enrich_wilcox`
+<- clusterProfiler::enrichGO(
+ enrich_wilcox # We provide the list of gene identifiers to test
+ gene = sobj_wilcox$ENTREZID,
+ # We provide the complete list of genes we quantified
+ universe = annotation$ENTREZID,
+ # We provide mouse annotations
+ OrgDb = org.Mm.eg.db,
+ # We tell the function that we use entrez-identifiers
+ keyType = "ENTREZID",
+ # We search results on Biological Process"
+ ont = "BP",
+ # We want all the results
+ pvalueCutoff = 1,
+ # We are humans, we wan human-readable gene names
+ readable = TRUE
+ )
We can have a look at pathways including the word G2M
,
+using the functions with
and grepl
from
+base
package.
# Get the result table
+<- enrich_wilcox@result
+ results_enrich_wilcox
+# We store the result of the line selection
+# in a variable called `g2m_enrich`
+<- results_enrich_wilcox[
+ g2m_enrich # We select rows containing a given information
+ # using the function `with` from `base` package
+ ::with(
+ base# We provide the variable pointing to enrichment results
+ data = results_enrich_wilcox,
+ # We provide the term we are searching and the column in which
+ # the term should be searched
+ ::grepl("G2M", Description)
+ base# Do not forget his comma, since we are filtering a dataframe on rows
+ ), ]
GSEA tests the gene names, and uses the numeric value associated with +that gene in order to weight the results. It tests the name, the rank, +and the numeric value.
+First, we need to create a named list of gene weights. For this +example, we use the average fold change as weights.
+# We extract gene average fold change
+<- sobj_wilcox$avg_log2FC
+ gene_list # We extract genes Entrez ID
+::names(gene_list) <- sobj_wilcox$ENTREZID
+ base# We sort that list deacreasingly
+<- base::sort(gene_list, decreasing=TRUE)
+ gene_list
+# Check the results with the function `head` from `utils` package
+::head(gene_list) utils
12504 12259 12262 14960 16149 14969
+8.970173 8.696339 8.573637 8.452598 8.323054 8.257669
+And now, we can run the gseGO
+from clusterProfiler
.
+You’ll see, the interfacce is almost the same as for the enrichment:
# We use the function `gseGO` from the package `clusterProfiler`
+# and store the result in a variable called `gsea_wilcox`
+<- clusterProfiler::gseGO(
+ gsea_wilcox # We provide the variable pointing to named gene list
+ geneList = gene_list,
+ # We provide mouse annotations
+ OrgDb = org.Mm.eg.db,
+ # We tell the function that we use entrez-identifiers
+ keyType = "ENTREZID",
+ # We search results on Biological Process"
+ ont = "BP",
+ # We want all the results
+ pvalueCutoff = 1
+ )
Just like before, we can have a look at pathways including the word
+G2M
, using the functions with
and grepl
from
+base
package.
# Get the result table
+<- enrich_wilcox@result
+ results_gse_wilcox
+# We store the result of the line selection
+# in a variable called `g2m_enrich`
+<- results_gse_wilcox[
+ g2m_gse # We select rows containing a given information
+ # using the function `with` from `base` package
+ ::with(
+ base# We provide the variable pointing to enrichment results
+ data = results_gse_wilcox,
+ # We provide the term we are searching and the column in which
+ # the term should be searched
+ ::grepl("G2/M", Description)
+ base# Do not forget his comma, since we are filtering a dataframe on rows !
+ ),
+ ]
+# Check the results with the function `head` from `utils` package
+::head(g2m_gse) utils
ID
+GO:1902751 GO:1902751
+GO:0010971 GO:0010971
+GO:1902749 GO:1902749
+GO:0044839 GO:0044839
+GO:0000086 GO:0000086
+GO:0010389 GO:0010389
+ Description
+GO:1902751 positive regulation of cell cycle G2/M phase transition
+GO:0010971 positive regulation of G2/M transition of mitotic cell cycle
+GO:1902749 regulation of cell cycle G2/M phase transition
+GO:0044839 cell cycle G2/M phase transition
+GO:0000086 G2/M transition of mitotic cell cycle
+GO:0010389 regulation of G2/M transition of mitotic cell cycle
+ GeneRatio BgRatio pvalue p.adjust qvalue
+GO:1902751 3/394 23/9476 0.06825935 0.4259430 0.3891670
+GO:0010971 2/394 20/9476 0.20114319 0.6661140 0.6086015
+GO:1902749 5/394 90/9476 0.31953152 0.7945751 0.7259713
+GO:0044839 6/394 112/9476 0.32235419 0.7945751 0.7259713
+GO:0000086 5/394 97/9476 0.37773602 0.7945751 0.7259713
+GO:0010389 4/394 80/9476 0.42724795 0.7945751 0.7259713
+ geneID Count
+GO:1902751 Cdk4/Mta3/Npm1 3
+GO:0010971 Cdk4/Mta3 2
+GO:1902749 Babam2/Cdk4/Cdk6/Mta3/Npm1 5
+GO:0044839 Babam2/Calm1/Cdk4/Cdk6/Mta3/Npm1 6
+GO:0000086 Babam2/Calm1/Cdk4/Cdk6/Mta3 5
+GO:0010389 Babam2/Cdk4/Cdk6/Mta3 4
+
+
+This makes a lot of tables. Let’s make a lot of graphs !
+We can use the function barplot
+from package graphics
; and
+not the ones from any other package, or it won’t work !
# We use the function `barplot` from package `enrichplot`
+::barplot(
+ graphics# We provide the variable pointing to enrichment results
+ height = enrich_wilcox,
+ # We display the best 15 results
+ howCategory=15
+ )
We can also use the function ditplot
+from package enrichplot
;
+and not the ones from any other package, or it won’t work ! Note the
+change in the input parameter name:
# We use the function `barplot` from package `enrichplot`
+::dotplot(
+ enrichplot# We provide the variable pointing to enrichment results
+ object = enrich_wilcox,
+ # We display the best 15 results
+ showCategory=15
+ )
We can display traditional GSEA graph using gseaplot2
+from package enrichplot
:
# We use the function `barplot` from package `enrichplot`
+::gseaplot2(
+ enrichplot# We provide the variable pointing to enrichment results
+ x = gsea_wilcox,
+ # We display the best result
+ geneSetID = 1
+ )
With GSEA, you dot not test if a pathway is up or down regulated. A +pathway contains both enhancers and suppressors genes. An up-regulation +of enhancer genes and a down-regulation of suppressor genes will lead to +a “bad” enrichment score. However, this will lead to a strong change in +your pathway activity!
+If your favorite pathway does not have a “good enrichment score”, it +does not mean that pathway is not affected.
+The heatplot
+displays both a heatmap of enriched pathways and their genes in
+commons:
# We use the function `heatplot` from `enrichplot` package
+::heatplot(
+ enrichplot# We probide the variable pointing to GSEA results
+ x = gsea_wilcox,
+ # We show the 15 best results
+ showCategory = 15,
+ # We color according to FoldChange
+ foldChange = gene_list
+ )
The genes in common between multiple gene sets are also visible +through an uspet plot:
+# We use the function `upsetplot` from `enrichplot` package
+::upsetplot(
+ enrichplot# We probide the variable pointing to GSEA results
+ x = gsea_wilcox,
+ # We show the 10 best results
+ n = 10
+ )
Finally, we can display whole pathways using KEGG database:
+# We use the function pathview from pathview package
+::pathview(
+ pathviewgene.data = gene_list, # Our gene list
+ pathway.id = "mmu04110", # Our pathway
+ species = "mmu", # Our organism
+ # The color limits
+ limit = list(gene=max(abs(gene_list))),
+ gene.idtype = "ENTREZID" # The genes identifiers
+ )
This list of all packages used while you work should be included in +each en every R presentation:
+::sessionInfo() utils
R version 4.4.1 (2024-06-14)
+Platform: x86_64-conda-linux-gnu
+Running under: Ubuntu 20.04.6 LTS
+
+Matrix products: default
+BLAS/LAPACK: /shared/ifbstor1/software/miniconda/envs/r-4.4.1/lib/libopenblasp-r0.3.27.so; LAPACK version 3.12.0
+
+locale:
+ [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
+ [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
+ [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
+ [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
+ [9] LC_ADDRESS=C LC_TELEPHONE=C
+[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
+
+time zone: Europe/Paris
+tzcode source: system (glibc)
+
+attached base packages:
+[1] stats4 stats graphics grDevices utils datasets methods
+[8] base
+
+other attached packages:
+ [1] pathview_1.44.0 Cairo_1.6-2
+ [3] org.Mm.eg.db_3.19.1 AnnotationDbi_1.66.0
+ [5] dittoSeq_1.16.0 clusterProfiler_4.12.6
+ [7] escape_2.0.0 SingleCellExperiment_1.26.0
+ [9] SummarizedExperiment_1.34.0 Biobase_2.64.0
+[11] GenomicRanges_1.56.2 GenomeInfoDb_1.40.1
+[13] IRanges_2.38.1 S4Vectors_0.42.1
+[15] BiocGenerics_0.50.0 MatrixGenerics_1.16.0
+[17] matrixStats_1.4.1 Seurat_5.1.0
+[19] SeuratObject_5.0.2 sp_2.1-4
+[21] dplyr_1.1.4 knitr_1.48
+[23] rstatix_0.7.2 ggpubr_0.6.0
+[25] ggplot2_3.5.1 DT_0.33
+[27] BiocParallel_1.38.0
+
+loaded via a namespace (and not attached):
+ [1] SpatialExperiment_1.14.0 R.methodsS3_1.8.2
+ [3] GSEABase_1.66.0 nnet_7.3-19
+ [5] goftest_1.2-3 Biostrings_2.72.1
+ [7] HDF5Array_1.32.0 vctrs_0.6.5
+ [9] spatstat.random_3.3-2 digest_0.6.37
+ [11] png_0.1-8 ggrepel_0.9.6
+ [13] deldir_2.0-4 parallelly_1.38.0
+ [15] magick_2.8.5 MASS_7.3-61
+ [17] reshape2_1.4.4 httpuv_1.6.15
+ [19] qvalue_2.36.0 withr_3.0.1
+ [21] xfun_0.48 ggfun_0.1.6
+ [23] survival_3.7-0 memoise_2.0.1
+ [25] gson_0.1.0 tidytree_0.4.6
+ [27] zoo_1.8-12 KEGGgraph_1.64.0
+ [29] pbapply_1.7-2 ggdist_3.3.2
+ [31] R.oo_1.26.0 Formula_1.2-5
+ [33] KEGGREST_1.44.0 promises_1.3.0
+ [35] httr_1.4.7 globals_0.16.3
+ [37] fitdistrplus_1.2-1 rhdf5filters_1.16.0
+ [39] rhdf5_2.48.0 rstudioapi_0.17.0
+ [41] UCSC.utils_1.0.0 miniUI_0.1.1.1
+ [43] generics_0.1.3 DOSE_3.30.1
+ [45] base64enc_0.1-3 babelgene_22.9
+ [47] zlibbioc_1.50.0 ScaledMatrix_1.12.0
+ [49] ggraph_2.2.1 polyclip_1.10-7
+ [51] GenomeInfoDbData_1.2.12 SparseArray_1.4.8
+ [53] xtable_1.8-4 stringr_1.5.1
+ [55] evaluate_1.0.1 S4Arrays_1.4.1
+ [57] irlba_2.3.5.1 colorspace_2.1-1
+ [59] ROCR_1.0-11 reticulate_1.39.0
+ [61] spatstat.data_3.1-2 Rgraphviz_2.48.0
+ [63] magrittr_2.0.3 lmtest_0.9-40
+ [65] later_1.3.2 viridis_0.6.5
+ [67] ggtree_3.12.0 lattice_0.22-6
+ [69] spatstat.geom_3.3-3 future.apply_1.11.2
+ [71] scattermore_1.2 XML_3.99-0.17
+ [73] shadowtext_0.1.3 cowplot_1.1.3
+ [75] ggupset_0.4.0 RcppAnnoy_0.0.22
+ [77] Hmisc_5.1-3 pillar_1.9.0
+ [79] nlme_3.1-165 compiler_4.4.1
+ [81] beachmat_2.20.0 RSpectra_0.16-2
+ [83] stringi_1.8.4 tensor_1.5
+ [85] plyr_1.8.9 msigdbr_7.5.1
+ [87] crayon_1.5.3 abind_1.4-8
+ [89] gridGraphics_0.5-1 org.Hs.eg.db_3.19.1
+ [91] graphlayouts_1.1.1 bit_4.5.0
+ [93] fastmatch_1.1-4 codetools_0.2-20
+ [95] BiocSingular_1.20.0 crosstalk_1.2.1
+ [97] bslib_0.8.0 plotly_4.10.4
+ [99] mime_0.12 splines_4.4.1
+[101] Rcpp_1.0.13 fastDummies_1.7.4
+[103] sparseMatrixStats_1.16.0 HDO.db_0.99.1
+[105] blob_1.2.4 utf8_1.2.4
+[107] fs_1.6.4 listenv_0.9.1
+[109] checkmate_2.3.1 DelayedMatrixStats_1.26.0
+[111] GSVA_1.52.3 ggsignif_0.6.4
+[113] ggplotify_0.1.2 tibble_3.2.1
+[115] Matrix_1.7-1 tweenr_2.0.3
+[117] pkgconfig_2.0.3 pheatmap_1.0.12
+[119] tools_4.4.1 cachem_1.1.0
+[121] RSQLite_2.3.7 viridisLite_0.4.2
+[123] DBI_1.2.3 fastmap_1.2.0
+[125] rmarkdown_2.28 scales_1.3.0
+[127] grid_4.4.1 ica_1.0-3
+[129] broom_1.0.7 sass_0.4.9
+[131] patchwork_1.3.0 dotCall64_1.2
+[133] graph_1.82.0 carData_3.0-5
+[135] RANN_2.6.2 rpart_4.1.23
+[137] snow_0.4-4 farver_2.1.2
+[139] tidygraph_1.3.1 scatterpie_0.2.4
+[141] yaml_2.3.10 foreign_0.8-86
+[143] cli_3.6.3 purrr_1.0.2
+[145] UCell_2.8.0 leiden_0.4.3.1
+[147] lifecycle_1.0.4 uwot_0.2.2
+[149] backports_1.5.0 annotate_1.82.0
+[151] gtable_0.3.5 rjson_0.2.21
+[153] ggridges_0.5.6 progressr_0.14.0
+[155] parallel_4.4.1 ape_5.8
+[157] jsonlite_1.8.9 bitops_1.0-9
+[159] RcppHNSW_0.6.0 bit64_4.5.2
+[161] Rtsne_0.17 yulab.utils_0.1.7
+[163] spatstat.utils_3.1-0 BiocNeighbors_1.22.0
+[165] highr_0.11 jquerylib_0.1.4
+[167] GOSemSim_2.30.2 spatstat.univar_3.0-1
+[169] distributional_0.4.0 R.utils_2.12.3
+[171] lazyeval_0.2.2 shiny_1.9.1
+[173] htmltools_0.5.8.1 enrichplot_1.24.4
+[175] GO.db_3.19.1 sctransform_0.4.1
+[177] rappdirs_0.3.3 glue_1.8.0
+[179] spam_2.11-0 httr2_1.0.1
+[181] XVector_0.44.0 RCurl_1.98-1.14
+[183] treeio_1.28.0 gridExtra_2.3
+[185] AUCell_1.26.0 igraph_2.1.1
+[187] R6_2.5.1 tidyr_1.3.1
+[189] labeling_0.4.3 ggpointdensity_0.1.0
+[191] cluster_2.1.6 Rhdf5lib_1.26.0
+[193] aplot_0.2.3 DelayedArray_0.30.1
+[195] tidyselect_1.2.1 htmlTable_2.4.2
+[197] ggforce_0.4.2 car_3.1-3
+[199] future_1.34.0 rsvd_1.0.5
+[201] munsell_0.5.1 KernSmooth_2.23-24
+[203] data.table_1.16.2 htmlwidgets_1.6.4
+[205] fgsea_1.30.0 RColorBrewer_1.1-3
+[207] rlang_1.1.4 spatstat.sparse_3.1-0
+[209] spatstat.explore_3.3-2 fansi_1.0.6
+