diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/DEA.metadata.json b/2024/evaiin1/SingleCell/DEA_GSEA/DEA.metadata.json new file mode 100644 index 0000000..5871cfe --- /dev/null +++ b/2024/evaiin1/SingleCell/DEA_GSEA/DEA.metadata.json @@ -0,0 +1,43 @@ +{ + "name": "SingleCell_DEA.Rmd", + "abstract": "Learn how to perform DEA (Differential Expression Analysis) based on a scaled, normalized, and filtered Seurat object", + "author": [ + { + "@type": "Person", + "name": "Thibault Dayris", + "email": "thibault.dayris@gustaveroussy.fr" + } + ], + "contributor": [ + { + "@type": "Person", + "name": "Emilie Drouineau", + "email": "emilie.drouineau@cea.fr" + } + ], + "description": "This tutorial let you see all function IO and contains the full verbatim of the EBAII 2023 session", + "audiance": { + "@type": "Audiance", + "audienceType": "beginner command line biologist" + }, + "educationalLevel": "Beginner", + "inLanguage": [ + "en-GB" + ], + "teaches": [ + "The student will be able to load Seurat object", + "The student will be able to select a DEA method", + "The student will be able to select a Seurat count slot", + "The student will be able to compose the command line themselves", + "The student will be albe to draw basic graphs", + "The student will be able to save results as CSV" + ], + "keywords": "Seurat, R, Differential Expression Analysis", + "learningResourceType": [ + "handout" + ], + "license": [ + "https://mit-license.org/" + ], + "dateModified": "2023-10-25T13:43:27,142911245+02:00" +} \ No newline at end of file diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/GSEA.metadata.json b/2024/evaiin1/SingleCell/DEA_GSEA/GSEA.metadata.json new file mode 100644 index 0000000..4b54f7f --- /dev/null +++ b/2024/evaiin1/SingleCell/DEA_GSEA/GSEA.metadata.json @@ -0,0 +1,43 @@ +{ + "name": "SingleCell_GSEA.Rmd", + "abstract": "Learn how to perform GSEA (Gene Set Enrichment Analysis) based on a scaled, normalized, and filtered Seurat object", + "author": [ + { + "@type": "Person", + "name": "Thibault Dayris", + "email": "thibault.dayris@gustaveroussy.fr" + } + ], + "contributor": [ + { + "@type": "Person", + "name": "Emilie Drouineau", + "email": "emilie.drouineau@cea.fr" + } + ], + "description": "This tutorial let you see all function IO and contains the full verbatim of the EBAII 2023 session", + "audiance": { + "@type": "Audiance", + "audienceType": "beginner command line biologist" + }, + "educationalLevel": "Beginner", + "inLanguage": [ + "en-GB" + ], + "teaches": [ + "The student will be able to load Seurat object", + "The student will be able to select a GSEA method", + "The student will be able to select a Seurat count slot", + "The student will be able to compose the command line themselves", + "The student will be albe to draw basic graphs", + "The student will be able to save results as CSV" + ], + "keywords": "Seurat, R, Gene Set Enrichment Analysis", + "learningResourceType": [ + "handout" + ], + "license": [ + "https://mit-license.org/" + ], + "dateModified": "2023-10-25T13:43:27,142911245+02:00" +} \ No newline at end of file diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/Makefile b/2024/evaiin1/SingleCell/DEA_GSEA/Makefile new file mode 100644 index 0000000..52607dd --- /dev/null +++ b/2024/evaiin1/SingleCell/DEA_GSEA/Makefile @@ -0,0 +1,29 @@ + +SHELL=/usr/bin/bash +.ONESHELL: +.SHELLFLAGS := -euic +.DELETE_ON_ERROR: +MAKEFLAGS += --warn-undefined-variables +MAKEFLAGS += --no-builtin-rules + +.PHONY: +all: SingleCell_GSEA.html SingleCell_DEA.html + +SingleCell_GSEA.html: ebaii SingleCell_DEA.html + mamba activate ./ebaii && \ + R -e 'rmarkdown::render("SingleCell_GSEA.Rmd")' + + +SingleCell_DEA.html: ebaii Scaled_Normalized_Seurat_Object.RDS + mamba activate ./ebaii && \ + R -e 'rmarkdown::render("SingleCell_DEA.Rmd")' + +Scaled_Normalized_Seurat_Object.RDS: + wget https://nextcloud.gustaveroussy.fr/s/p8Ab8Be43xFogYN/download/Scaled_Normalized_Seurat_Object.RDS + +ebaii: + mamba env create -p ./ebaii -f environment.yaml + +.PHONY: +clean: + rm -rf ./ebaii *.RDS diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/SingleCell_DEA.Rmd b/2024/evaiin1/SingleCell/DEA_GSEA/SingleCell_DEA.Rmd new file mode 100644 index 0000000..b633255 --- /dev/null +++ b/2024/evaiin1/SingleCell/DEA_GSEA/SingleCell_DEA.Rmd @@ -0,0 +1,966 @@ +--- +title: "
EBAII n1 2024
Differential expression analysis
" +date: "`r Sys.Date()`" +author: + - name: "Thibault DAYRIS" + email: "thibault.dayris@gustaveroussy.fr" + - name: "Bastien JOB" + email: "bastien.job@gustaveroussy.fr" +output: + html_document: # Defautl view + highlight: tango ## Theme for the code chunks + number_sections: true ## Adds number to headers (sections) + theme: flatly ## CSS theme for the HTML page + toc: true ## Adds a table of content + toc_float: ## TOC options + collapsed: true ## By default, the TOC is folded + smooth_scroll: true ## Smooth scroll of the HTML page + self_contained: true ## Includes all plots/images within the HTML + code_download: true ## Adds a button to download the Rmd + code_folding: show + thumbnails: false + lightbox: true + fig_caption: true + gallery: true + use_bookdown: true +always_allow_html: true ## Allow plain HTML code in the Rmd +editor_options: + markdown: + wrap: 88 +--- + + + + +```{css banner, echo = FALSE} +body { + background-image: url('ebaii_banner.png'); + background-repeat: no-repeat; + background-size: 100%; + margin: 10% +} + +div { + background-color: rgba(255, 255, 255, 0.35) /* 35% opaque white */; + padding: 0.25em; +} +``` + + + +```{r setup, include=FALSE, echo=FALSE} +knitr::opts_chunk$set( + echo = TRUE, # Print the code + eval = TRUE, # Do not run command lines + message = FALSE, # Print messages + prompt = FALSE, # Do not display prompt + comment = NA, # No comments on this section + warning = FALSE, # Display warnings + tidy = FALSE, + fig.align = "center", + width = 100 # Number of characters per line +) +hooks = knitr::knit_hooks$get() +hook_foldable = function(type) { + force(type) + function(x, options) { + res = hooks[[type]](x, options) + + if (isFALSE(options[[paste0("fold.", type)]])) return(res) + + paste0( + "
Show ", type, "\n\n", + res, + "\n\n
" + ) + } +} +Hmisc::hidingTOC( + buttonLabel = 'Show TOC', + hidden = TRUE, + tocSide = 'left', + buttonSide='left', + posCollapse = 'margin', + levels = 3 +) +my_seed <- 1337L +``` + + + +```{css chunks, echo = FALSE} +div.beyond pre { background-color: pink; color : black; } +div.beyond pre.r { background-color: lightblue; border: 3px solid blue; } +div.notrun pre { background-color: lightyellow; color : brown; } +div.notrun pre.r { background-color: lightgrey; border: 3px solid black; } +``` + + + + +# Forewords + +I need three teams for this session: +team [`Wilcoxon`](https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test), +team [`Student`](https://en.wikipedia.org/wiki/Student%27s_t-test), +and team [`ROC`](https://en.wikipedia.org/wiki/Receiver_operating_characteristic). + +I will also need your help because I can't make [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) +work correctly. But I'm sure, that we will solve my issue: you're in the best +session here at [EBAII](https://github.com/IFB-ElixirFr/EBAII). + +## TLDR: R command lines + +In this presentation, there will be screen captures for you to follow the +lesson. There will also be every single R command lines. +Do not take care of the command lines if you find them too challenging. +Our goal here, is to understand the main mechanism of Differential +Expression Analysis. R is just a tool. + +Below are the libraries we need to perform this whole session: + +```{r load_libraries, eval=TRUE, echo=TRUE} +base::library(package = "BiocParallel") # Optionally multithread some steps +base::library(package = "DT") # Display nice table in HTML +base::library(package = "ggplot2") # Draw graphs and plots +base::library(package = "ggpubr") # Draw nicer graphs +base::library(package = "rstatix") # Base R statistics +base::library(package = "knitr") # Build this presentation +base::library(package = "dplyr") # Handle big tables +base::library(package = "Seurat") # Handle SingleCell analyses +base::library(package = "SeuratObject") # Handle SingleCell objects +base::library(package = "SingleCellExperiment") # Handle SingleCell file formats +base::library(package = "UpSetR") # Nice venn-like graphs +base::library(package = "EnhancedVolcano") # Draw Volcano plot +``` + +First, we load Seurat object: + +```{r load_rds, eval=FALSE, echo=TRUE} +sobj <- base::readRDS( + # Path to the RDS file + file = "/shared/projects/ebaii_sc_teachers/SC_TD/06_Integration/RESULTS/12_TD3A.TDCT_S5_Integrated_12926.3886.RDS" +) +``` + +Then we join layers: + +```{r join_layers_tldr, eval=FALSE, echo=TRUE} +Seurat::Idents(sobj) <- sobj$orig.ident +sobj <- SeuratObject::JoinLayers(sobj) +``` + +Then we perform differential expression analysis: + +```{r run_dea, eval=FALSE, echo=TRUE} + +sobj_de <- Seurat::FindMarkers( + # Object on which to perform DEA + object = sobj, + # Name of factor in condition 1 + ident.1 = "TD3A", + # Name of factor in condition 2 + ident.2 = "TDCT" +) +``` + +And that's all. Our goal is to understand these lines, +being able to write them is a bonus. + +## Purpose of this session + +Up to now, we have: + +1. Identified to which cell each sequenced reads come from +1. Identified to which gene each read come from +1. Identified possible bias in gene expression for each cell +1. Filtered and corrected these bias as well as we can + +We would like to identify the list of genes that characterize differences +between cell cycle phases G1 and S groups. + +At the end of this session you will know: + +1. how to select a differential analysis method +1. how to select the correct normalization (if any?) that must be provided to your differential analysis method +1. How to read differential expression results + + +## Load RDS dataset + +You already have a dataset loaded ? Good. Keep on going with it ! You don't have one ? Use mine: + +```{r load_rds_exec, eval=TRUE, echo=TRUE} +# The function `readRDS` from package `base`. +sobj <- base::readRDS( + # Path to the RDS file + file = "/shared/projects/ebaii_sc_teachers/SC_TD/06_Integration/RESULTS/12_TD3A.TDCT_S5_Integrated_12926.3886.RDS" +) +``` + +The command above, uses the function [`readRDS`](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/readRDS) +from `base` R package. and git it the path to the [RDS](https://riptutorial.com/r/example/3650/rds-and-rdata--rda--files) +file we built in previous session. + +## + +## Insight + +We are wondering what's in our dataset. Let's have a look, +with the function [`print`](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/print) +form the package `base`. + +```{r print_seurat, eval=TRUE, echo=TRUE} +base::print( + # Object to display + x = sobj +) +``` + +We have `r base::dim(SeuratObject::GetAssayData(object = sobj, assay = "RNA", layer = "data.TD3A"))[1]` features (_aka_ genes), +across `r base::dim(SeuratObject::GetAssayData(object = sobj, assay = "RNA", layer = "data.TD3A"))[2]` samples (_aka_ cells) in the TD3A condition. We have `r base::dim(SeuratObject::GetAssayData(object = sobj, assay = "RNA", layer = "data.TDCT"))[1]` features (_aka_ genes), +across `r base::dim(SeuratObject::GetAssayData(object = sobj, assay = "RNA", layer = "data.TDCT"))[2]` samples (_aka_ cells) in the TD3A condition. + + +Let us have a look at the RNA counts for 10 cells and their annotation, +with the function [`head`](https://www.rdocumentation.org/packages/utils/versions/3.6.2/topics/head) from the package `utils`. + +```{r head_seurat_counts_phase, eval=FALSE, echo=TRUE} +utils::head( + # Object to visualize + x = sobj, + # Number of lines to display + n = 10 +) +``` + + +```{r head_seurat_counts_phase_dt, eval=TRUE, echo=FALSE} +tmp <- utils::head(x = sobj, n = 10) +DT::datatable(data = tmp) +``` + +There is no counts, normalized or not. Where are they ? + +In order to explore the content of `sobj`, use the function `str` from the package `utils`: + +```{r str_seurat_object, eval=FALSE, echo=TRUE} +utils::str( + # Object to explore + object = sobj@assays +) +``` + +Alternatively, in RStudio, you can click on the object pane and explore manually +the content of the object. If we explore the slot `assays`, then we find the counts. + +You can access them with: + +```{r head_seurat_count_table, eval=FALSE, echo=TRUE} +utils::head( + # Object to visualize + x = SeuratObject::GetAssayData(object = sobj, assay = "RNA", layer = "data.TD3A"), + # Number of rows to display + n = 10 +) +``` + +```{r head_seurat_count_table_dt, eval=TRUE, echo=FALSE} +tmp <- utils::head( + x = SeuratObject::GetAssayData(object = sobj, assay = "RNA", layer = "data.TD3A"), + 10 +) +DT::datatable( + data = base::as.data.frame(tmp)[, 1:5] +) +``` + +We have one gene per line, one cell per column, and RNA counts in each row. + + +For the sake of this session, we won't compare the whole dataset. It would take +up to 15 minutes to complete. During the rest of this session, we will compare +clusters 8 and 10, as annotated with Harmony. + +We need to re-annotate layers to do so. This is done in two steps: + +1. redefine [`Idents`](https://www.rdocumentation.org/packages/Seurat/versions/3.1.4/topics/Idents) +in order to be able to call cells by their cluster names. +1. [`JoinLayers`](https://www.rdocumentation.org/packages/SeuratObject/versions/5.0.2/topics/JoinLayers) to have a single count table with both TD3A and TDCT cells from clusters 8 and 10 together. + +```{r ident_and_join_layers, eval=TRUE, echo=TRUE} +Seurat::Idents(sobj) <- sobj$HarmonyStandalone_clusters +sobj <- SeuratObject::JoinLayers(sobj) +``` + +Let's check if our cells are joint: + +```{r check_joint_layers} +base::print(sobj) +``` + + +We have one gene per line, one cell per column, and RNA counts in each row. +Are these counts normalized ? Are they scaled ? Are they filtered ? Are +they corrected ? + +
+ +Answer + +These counts are normalized, scaled, filtered. This information is +available in the seurat object itself, within the slot `commands`. See an +example below: + +```{r seurat_history, eval=TRUE, echo=TRUE} +names(sobj@commands) +``` + +**However**, please be aware that counts in the slot `count` are raw counts. +Normalized counts are in the slot `data` and scaled data are in the slot +`scaled.data`. And it you do not find that clear, I totally agree with you. + +
+
+ + +Is it normal that we have so many zeros ? And what about surch low counts, +is it because we downsampled the sequenced reads for this session ? + +
+ +Answer + +The large number of null counts is completely normal. In maths/stats +we talk about _matrix sparcity_, _i.e_ a table with lots of zeros. If the +data were to be downsampled, we had had done this by cropping over a small +chromosome, and not reducing the general amount of reads. + +
+
+ +# Select a DE method + +## Available methods + +[Seurat](https://satijalab.org/seurat/articles/de_vignette) let us use multiple +differential analysis methods with its function [`FindMarkers`](https://satijalab.org/seurat/reference/findmarkers). + +1. [wilcox](https://www.rdocumentation.org/packages/rstatix/versions/0.7.1): The wilcoxon test tests the mean of expression and looks for a difference in these means. +1. [MAST](https://www.bioconductor.org/packages/release/bioc/vignettes/MAST/inst/doc/MAST-Intro.html): This tool has been built for Single Cell. It is based on a statistical model called ["Hurdle Model"](https://en.wikipedia.org/wiki/Hurdle_model), which excells with data that contains lots of zeros (which is our case in Single Cell RNA-Seq: most of the genes are *not* expressed.) +1. [DESeq2](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#recommendations-for-single-cell-analysis): This tool has originally been built for bulk RNA-Seq but now includes specific functions for Single Cell. It performs well when counts are highly variable or when you wand to compare a handful of cells. +1. [t-test](https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/t.test): The t-test uses a comparison of means to assess differential expression probability. +1. [roc](https://en.wikipedia.org/wiki/Receiver_operating_characteristic): An AUC value of 1 means that expression values for this gene alone can perfectly classify the two groupings (i.e. Each of the cells in cells.1 exhibit a higher level than each of the cells in cells.2). An AUC value of 0 also means there is perfect classification, but in the other direction. A value of 0.5 implies that the gene has no predictive power to classify the two groups. + +The main question now is **how to choose the right test**: spoilers, there are no option better than another in all ways. + +From Soneson & Robinson (2018) Nature Methods: + +![de_tools](images/DE_tools.png) + +Here, researchers have designed an artificial dataset where they knew in advance the list of differentially expressed genes. They have used all these algorithms and consigned the results. + +1. DESeq2, Limma seems to have a higher number of false positives (genes called differentially expressed while they were not.) +1. Wilcoxon seems to be better in general +1. Mast performs well in absolute + +ANOVA was not present in this study. + +## Case: Plac8 + +The question is now to guess whether this gene is differnetially expressed or not. + +### Cell observations + +Let's have a look at the gene named ['Plac8'](https://www.genecards.org/cgi-bin/carddisp.pl?gene=PLAC8&keywords=plac8), +involved in positive regulation of cold-induced thermogenesis and positive +regulation of transcription by RNA polymerase II. In order to plot its +expression across all cells, we are going to use the function +[`VlnPlot`](https://satijalab.org/seurat/reference/vlnplot) +from `Seurat` package. The input object is obviously contained in the `sobj` +variable, since it is our only Seurat object. In addition, we are going to +select the feature `Plac8`, and split the graph according to the clusters +we annotated earlier. + +```{r seurat_vlnplot_Plac8_demo, eval=TRUE, echo=TRUE} +Seurat::VlnPlot( + # A subset of the Seurat object + # limited to clusters 8 and 10, + # or else we will plot all the clusters + object = subset(sobj, HarmonyStandalone_clusters %in% c("8", "10")), + # The name of the gene of interest (feature = gene) + features = "Plac8", + # The name of the Seurat cell annotation + split.by = "HarmonyStandalone_clusters", + # Change color for presentation + cols = c("darkslategray3", "olivedrab") +) +``` + +Using your _'intuition'_, is this gene differentially expressed between cluster +8 and cluster 10 ? + +
+ +Answer + +In cluster 10, the violin plot highlights almost no cells with low or zero `Plac8` +expression. The highest density of cells has a `Plac8` normalized expression +aroung 1.5. + +In Cluster 8, cells seems to have no expression of `Plac8` at all. + +IMHO, and you can disagree, the expression of the gene `Plac8` differs between +cluster 8 and cluster 10. This is purely intuitive. + +
+ +Using your _'intuition'_, is this gene differentially expressed between G1 and S phases ? + + +
+
+ +Answer + +```{r vlnplot_seurat_group_phase_code, echo=TRUE, eval=TRUE} +Seurat::VlnPlot( + # The Seurat object + object = sobj, + # The name of the gene of interest (feature = gene) + features = "Plac8", + # The name of the Seurat cell-cycle annotation + group.by = "CC_Seurat_Phase", + # Change color for presentation + cols = c("darkslategray3", "olivedrab", "orangered3") +) +``` + +Most of the expression is null, some cells express the gene. + +
+
+ +Okay, let's have some informations about these distributions. + +```{r general_count_table, eval=TRUE, echo=TRUE} +# Store counts in a variable +counts <- base::as.data.frame( + # The matrix to reformat into a dataframe + x = SeuratObject::GetAssayData(object = sobj, assay = "RNA", layer = "data") +) +# Rename cells with cell harmony cluster +base::colnames(counts) <- paste( + # The names of the cell cluster for each cell + sobj$CC_Seurat_Phase, + # The names of the cells themselves + colnames(sobj), + sep = "_" +) +``` + +```{r general_count_table_display, eval=TRUE, echo=FALSE} +DT::datatable(head(counts)) +``` + +
+We have `r length(colnames(sobj[, sobj$CC_Seurat_Phase == "G1"]))` cells within the G1 group: + +```{r Plac8_summaries_G1, eval=TRUE} +countsG1 <- select(counts, matches("^G1.")) + +Plac8G1 <- base::as.data.frame(base::t(countsG1["Plac8", ])) + +base::summary(Plac8G1) +``` +
+
+We have `r length(colnames(sobj[, sobj$CC_Seurat_Phase == "S"]))` cells withing the S group: + +```{r Plac8_summaries_S, eval=TRUE} +countsS <- select(counts, matches("^S.")) + +Plac8S <- base::as.data.frame(base::t(countsS["Plac8", ])) + +base::summary(Plac8S) +``` +
+ +### From biology to statistics + +Okay, let us resort on statistics to evaluate our chances to be guess correctly, or our risks to guess wrong. + +We have lots of observations: `r base::length(base::colnames(sobj[, sobj$CC_Seurat_Phase == "G1"]))` +cells within the G1 phase, and `r base::length(base::colnames(sobj[, sobj$CC_Seurat_Phase == "S"]))` +cells withing the S phase.Statisticians really like to have a lot of +observations! Ideally, statisticians always want to have more observation +than tests. We have a total of `r base::length(base::colnames(sobj[, sobj$CC_Seurat_Phase == "G1"])) + base::length(base::colnames(sobj[, sobj$CC_Seurat_Phase == "S"]))` +observations and we are testing 1 gene. For them, this is a dream come true! + +Are our cells supposed to be interacting with each others ? +Are they independent from each others ? +This is very important, and usually, it requires a discussion. + +oes the expression in our cells follow a normal distribution ? +It's easy to check. Let's draw the expression like we did above. + +First, we use the function [`rbind`](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/cbind) +from `base` package. Be careful, the function `rbind` also exists +in `DelayedArray`, `data.table`, and `BiocGenerics` packages. +We want to use the basic one. Here is an example of odd behaviors +that occurs when not using the package name before a function call. + +```{r distribution_Plac8_table_build, eval=TRUE, echo=TRUE} +# Add column idientifiers in each count dataframes +Plac8G1$phase <- "G1" +Plac8S$phase <- "S" +# Paste the rows beneith each other +Plac8 <- base::rbind( + # variable pointing to G1 counts + Plac8G1, + # variable pointing to S counts + Plac8S, + # A Boolean, requesting that strings/characters + # should not be casted as `logical`. It breaks graphs. + stringsAsFactors = FALSE +) +``` + +Secondly, we use the function [`gghistogram`](https://www.rdocumentation.org/packages/ggpubr/versions/0.6.0/topics/gghistogram) +from package `ggpubr` in order to display relative abundance of gene expression: + +```{r distribution_Plac8_display, eval=TRUE, echo=TRUE} +ggpubr::gghistogram( + Plac8, + x = "Plac8", + y = '..density..', + fill = "steelblue", + bins = 15, + add_density = TRUE +) +``` + + +Our distribution doesn't seems to be normal, nor binomial we will have to rely on non-parametric tests. + +Let's run a non-parametric test base on the mean of distributions, +since it's the clothest to our 'intuitive' approach. Let there be +[Wilcoxon](https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test) test. + +In R, it's quite straightforward: we have the function +[`wilcoxon_test`](https://www.rdocumentation.org/packages/rstatix/versions/0.7.1) +to perform the test, then we can plot the result. + +```{r wilcoxon_Plac8, eval = TRUE, echo=TRUE} +# On the expression table stored in the varialbe `Plac8`, +# first apply the function `wilcox_test` from package `rstatix`, +# then we apply the function `add_significance` from package `rstatix` +stat.test <- Plac8 %>% rstatix::wilcox_test(Plac8 ~ phase) %>% rstatix::add_significance() +# While doing so, we usually also compute effect size +# eff.size <- Plac8 %>% rstatix::wilcox_effsize(Plac8 ~ phase) +``` + +```{r display_wilcoxon_Plac8_result, echo = FALSE, eval = TRUE} +DT::datatable(stat.test, caption = "Wilcoxon test result") +stat.test <- Plac8 %>% rstatix::t_test(Plac8 ~ phase) %>% rstatix::add_significance() +``` + +Wilcoxon test says: the distributions are different, with a `r stat.test$p * 100` +% of risks of being wrong. The gene Plac8 can safely be said differentially +expressed. We can compute a fold change and conclude. + +Just out of curiosity, be aware that [`t_test`](https://www.rdocumentation.org/packages/rstatix/versions/0.7.2/topics/t_test) +from `rstatix` package, gives the same answer. However, +[`DESeq`](https://satijalab.org/seurat/reference/findmarkers) gives a p-value +of 0.05 and an adjusted p-value equal to 1. + +Depending on the test used, this gene is, or is not differentially expressed. + + +## Conclusion + +In conclusion, to select your method, use the following: + +1. If you have already done a study with one of these methods, keep using the same. This is crutial if you ever want to compare your new result with the old ones. +1. If you want to compare your study with a published one, use the same method. +1. If you have no idea, use Wilcoxon. +1. If you have bulk data analyzed with DESeq2/Limma, use DESeq2/Limma. It will be easier to take both works in consideration. + +**Please, never use a simple Wilcoxon on bulk RNA-Seq data.** + +# Select a dataset + +## Dataset depends on selected method + +There it is quite easier: + +```{r choose_counts, eval=TRUE, results='asis', echo=FALSE} +choose_counts <- as.data.frame(t(data.frame( + wilcox = c("Normalized counts", "sojb@assays[['RNA']]@data"), + t_test = c("Normalized counts", "sojb@assays[['RNA']]@data"), + ROC = c("Normalized counts", "sojb@assays[['RNA']]@data"), + ANOVA = c("Normalized counts", "sojb@assays[['RNA']]@data"), + MAST = c("Raw counts", "sojb@assays[['RNA']]@counts"), + DESeq2 = c("Raw counts", "sojb@assays[['RNA']]@counts"), + Limma = c("Raw counts", "sojb@assays[['RNA']]@counts") +))) +colnames(choose_counts) <- c("Counts", "slot name") +DT::datatable(choose_counts, caption = "How to select your counts") +``` + +## FindMarkers + +With the function [`FindMarkers`](https://satijalab.org/seurat/reference/findmarkers) +from package `Seurat`, we want to make three groups: + +1. One using `wilcoxon` to perform DEA between "G1" and "S" phases. +1. One using `t`-test to perform DEA between "G1" and "S" phases. +1. One using `ROC` to perform DEA between "G1" and "S" phases. + +We will observe the results and compare our gene lists. + +Hey, why are you looking at me? It's your turn to work! Use the all the +notions seen above to select the right counts (`slot`), the right input +object, and the right arguments. + +10 minutes practice ! + +
+ +Answers + +Here are the code for each team: + +```{r findmarkers_all_de, echo=TRUE, eval=TRUE} +sobj_wilcox <- Seurat::FindMarkers( + # The variable that contains Seurat Object + object = sobj, + # Name of condition 1 + ident.1 = "8", + # Name of condition 2 + ident.2 = "10", + # Factor name in the Seurat Object + group.by = "HarmonyStandalone_clusters", + # Differential analysis method + test.use = "wilcox" +) + +sobj_t <- Seurat::FindMarkers( + # The variable that contains Seurat Object + object = sobj, + # Name of condition 1 + ident.1 = "8", + # Name of condition 2 + ident.2 = "10", + # Factor name in the Seurat Object + group.by = "HarmonyStandalone_clusters", + # Differential analysis method + test.use = "t" +) + +sobj_roc <- Seurat::FindMarkers( + # The variable that contains Seurat Object + object = sobj, + # Name of condition 1 + ident.1 = "8", + # Name of condition 2 + ident.2 = "10", + # Factor name in the Seurat Object + group.by = "HarmonyStandalone_clusters", + # Differential analysis method + test.use = "roc" +) +``` + +```{r load_dea, eval=TRUE, echo=FALSE} +base::saveRDS(file="sobj_wilcox.RDS", object=sobj_wilcox) +``` + +
+
+ +In the function argument, there is a FoldChange threshold. Should we +filter gene expression based on FoldChange? In case of positive answer, +how much should that threshold be? + +
+ +Answer + + +About thresholds on FDR (False Discovery Rate) and Log2(FC) (Log of the Fold Change), there are many discussions. + +Remember, here the threshold on Fold Change is Logged. A `log2(1) = ``r log2(1)`. And keep in mind the following: + +1. If one selects a fold change threshold above 1.7, then their study concludes that smoking is not related to lung cancer. +1. If one selects a fold change threshold above 1, then their study concludes that a fast-food based diet does not lead to weight gain. +1. If one selects a fold change threshold above 1/25 000 000, then their study concludes: it is a complete hazard that mice have featal malformation when in presence of Bisphanol. + +In conclusion, there one, and only one reason to filter on fold change: in my experience, a fold change below 0.7 will be hard to see/verify on wet-lab (qRT). + +If you need to reduce a too large number of differentially expressed genes, then reduce the FDR to 0.01, or even better, to 0.001. With that, you reduce your number of false claims. + +
+
+ +Can you help me with `DEseq2`? + +When I run the following command line, I have an error : + +```{r seurat_run_deseq_with_error, eval=FALSE, echo=TRUE} +sobj_deseq2 <- Seurat::FindMarkers( + # The variable that contains Seurat Object + object = sobj, + # Name of condition 1 + ident.1 = 8, + # Name of condition 2 + ident.2 = 10, + # Factor name in the Seurat Object + group.by = "HarmonyStandalone_clusters", + # Differential analysis method + test.use = "deseq2", + # Use non-normalized data with DESeq2 + slot = "counts" +) +``` + +> Error in PerformDE(object = object, cells.1 = cells.1, cells.2 = cells.2, : +> Unknown test: deseq2 + + +
+ +Answer + +Oh! My fault, it was a typo in my command! Thank you all for your help! + +```{r build_count_p1_matrix, eval=TRUE, echo=TRUE} +sobj_deseq2 <- Seurat::FindMarkers( + # The variable that contains Seurat Object + object = sobj, + # Name of condition 1 + ident.1 = "8", + # Name of condition 2 + ident.2 = "10", + # Factor name in the Seurat Object + group.by = "HarmonyStandalone_clusters", + # Differential analysis method + test.use = "DESeq2", + # Use non-normalized data with DESeq2 + slot = "counts" +) +``` + +Remark: by doing surch modification, some fold changes have been modified: +remember the gene Atad2 with a mean expression of 0.08 in G1, and 0.2 in S +phases? Mean expressions are now around 1.08 for G1, and 1.2 for S phases. +This may be the reason why it was not differentially expressed in DESeq2, +while Wilcoxon and T-test returned adjusted pvalues far below 0.05. + +
+ + + +```{r save_de_results, eval=TRUE, echo=FALSE} +base::saveRDS(sobj_wilcox, "sobj_wilcox.RDS") +base::saveRDS(sobj_t, "sobj_t.RDS") +base::saveRDS(sobj_roc, "sobj_roc.RDS") +base::saveRDS(sobj_deseq2, "sobj_deseq2.RDS") +``` + +# Explore results + +## Big tables + +Let us have a look at the results: + +```{r sobj_w_res_display, eval=TRUE, echo=FALSE} +DT::datatable( + head(sobj_wilcox, n = 10), + caption = "Wilcoxon test results" +) +``` + +We have the following columns: + +1. `p_val`: Ignore this column. Always ignore raw p-values. Look at corrected ones, and if they are missing, then compute them. +1. `avg_log2FC`: Average Log2(FoldChange). Illustrates how much a gene is differentially expessed between samples in each condition. +1. `pct.1`: Percent of cells with gene expression in condition one, here in "G1" phase. +1. `pct.2`: Percent of cells with gene expression in condition two, here in "S" phase. +1. `p_val_adj`: The confidence we have in the result. The closer to 0, the lesser is the risk of error. + + +```{r sobj_t_res_display, eval=TRUE, echo=FALSE} +DT::datatable( + head(sobj_t, n = 10), + caption = "T-test results" +) +``` + +We have the following columns: + +1. `p_val`: Ignore this column. Always ignore raw p-values. Look at corrected ones, and if they are missing, then compute them. +1. `avg_log2FC`: Average Log2(FoldChange). Illustrates how much a gene is differentially expessed between samples in each condition. +1. `pct.1`: Percent of cells with gene expression in condition one, here in "G1" phase. +1. `pct.2`: Percent of cells with gene expression in condition two, here in "S" phase. +1. `p_val_adj`: The confidence we have in the result. The closer to 0, the lesser is the risk of error. + +```{r sobj_roc_res_display, eval=TRUE, echo=FALSE} +DT::datatable( + head(sobj_roc, n = 10), + caption = "ROC test results" +) +``` + +We have the following columns: + +1. `myAUC`: The area under the curve +1. `avg_diff`: Average difference in AUC +1. `power`: `abs(AUC-0.5) * 2`, usefull to sort genes based on AUC +1. `pct.1`: Percent of cells with gene expression in condition one, here in "G1" phase. +1. `pct.2`: Percent of cells with gene expression in condition two, here in "S" phase. + + +```{r sobj_deseq_res_display, eval=TRUE, echo=FALSE} +DT::datatable( + head(sobj_t, n = 10), + caption = "T-test results" +) +``` + +We have the following columns: + +1. `p_val`: Ignore this column. Always ignore raw p-values. Look at corrected ones, and if they are missing, then compute them. +1. `avg_log2FC`: Average Log2(FoldChange). Illustrates how much a gene is differentially expessed between samples in each condition. +1. `pct.1`: Percent of cells with gene expression in condition one, here in "G1" phase. +1. `pct.2`: Percent of cells with gene expression in condition two, here in "S" phase. +1. `p_val_adj`: The confidence we have in the result. The closer to 0, the lesser is the risk of error. + + +## Filter DEA results + +What kind of threshold should be used to filter each results? + +```{r extract_de_genes, eval=TRUE, echo=FALSE} +# We store a `list` in a variable called `data` +# The function `list` comes from `base` and not `biocGenerics`. +data <- base::list( + # We use a threshold of 5% on adjusted p-values + wilcox = base::rownames(sobj_wilcox[sobj_wilcox$p_val_adj <= 0.05, ]), + # We use a threshold of 5% on adjusted p-values + t_test = base::rownames(sobj_t[sobj_t$p_val_adj <= 0.05, ]), + # We use a threshold of 0.2 in AUC power + roc = base::rownames(sobj_roc[sobj_roc$power >= 0.2, ]), + # We use a threshold of 5% on adjusted p-values + deseq2 = base::rownames(sobj_deseq2[sobj_deseq2$p_val_adj <= 0.05, ]) +) +``` + +> If we must label certain scores as good or bad, we can reference the +following rule of thumb: +> +> 0.5 = No discrimination +> 0.5-0.7 = Poor discrimination +> 0.7-0.8 = Acceptable discrimination +> 0.8-0.9= Excellent discrimination +> 0.9 = Outstanding discrimination + +Hosmer and Lemeshow in Applied Logistic Regression (p. 177) + +## Add results to Seurat objects + +We'd like to store the results of differential expression analysis in +the `Seurat` object. + +```{r add_results_seurat, echo=TRUE, eval=TRUE} +sobj@misc$wilcox <- sobj_wilcox +``` + +```{r save_sobjw, eval=TRUE, echo=FALSE} +base::saveRDS(sobj, "DEA_Scaled_Normalized_Filtered.RDS") +``` + +## Common results + +Now we can plot intersections in an up-set graph. It is like a venn diagramm: + +```{r upset_seurat_de_methods, eval=TRUE, echo=TRUE} +UpSetR::upset( + data = UpSetR::fromList(data), + order.by = "freq" +) +``` + + + +## Heatmap + +We'd like to display the expression of genes identified by FindMarkers. +Then we use the function [`DoHeatmap`](https://satijalab.org/seurat/reference/doheatmap) +from the package `Seurat`. + +In order to limit the graph to differentially expressed reads, we use the +function [`rownames`](https://rdocumentation.org/packages/base/versions/3.6.2/topics/row.names) +from R `base` package on the DEA result table. In this example, I use +the results of wilcoxon, but you shall use any of the results you previously +obtained. + +```{r seurat_heatmap, eval=TRUE, echo=TRUE} +Seurat::DoHeatmap( + # variable pointing to seurat object + object = sobj, + # name of DE genes + features = base::rownames(sobj_wilcox), + # Cluster annotation + group.by = "HarmonyStandalone_clusters", +) +``` + +## Volcano plot + +A Volcano plot is usefull to identify differnetial expression +analysis bias. + +The package `EnhancedVolcano` has an [eponym](https://bioconductor.org/packages/3.17/bioc/html/EnhancedVolcano.html) +function for that: + +```{r enhanced_volcanoplot, eval=TRUE, echo=TRUE} +EnhancedVolcano::EnhancedVolcano( + # variable pointing to the DEA results + toptable = sobj_wilcox, + # Gene names + lab = rownames(sobj_wilcox), + # Column in which to find Fold Change + x = "avg_log2FC", + # Column in which to find confidence interval + y = "p_val_adj", + # Lower fold-change cut-off + FCcutoff = 0.2 +) +``` + +## Session Info + +This list of all packages used while you work should be included +in each en every R presentation: + +```{r session_info, eval=TRUE, echo=TRUE} +utils::sessionInfo() +``` \ No newline at end of file diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/SingleCell_DEA.html b/2024/evaiin1/SingleCell/DEA_GSEA/SingleCell_DEA.html new file mode 100644 index 0000000..976190a --- /dev/null +++ b/2024/evaiin1/SingleCell/DEA_GSEA/SingleCell_DEA.html @@ -0,0 +1,5155 @@ + + + + + + + + + + + + + + + ¶ EBAII n1 2024Differential expression analysis ¶ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+
+
+
+
+ +
+ + + + + + + + + + + + + +
+

1 Forewords

+

I need three teams for this session: team Wilcoxon, +team Student, +and team ROC.

+

I will also need your help because I can’t make DESeq2 +work correctly. But I’m sure, that we will solve my issue: you’re in the +best session here at EBAII.

+
+

1.1 TLDR: R command +lines

+

In this presentation, there will be screen captures for you to follow +the lesson. There will also be every single R command lines. Do not take +care of the command lines if you find them too challenging. Our goal +here, is to understand the main mechanism of Differential Expression +Analysis. R is just a tool.

+

Below are the libraries we need to perform this whole session:

+
base::library(package = "BiocParallel")    # Optionally multithread some steps
+base::library(package = "DT")              # Display nice table in HTML
+base::library(package = "ggplot2")         # Draw graphs and plots
+base::library(package = "ggpubr")          # Draw nicer graphs
+base::library(package = "rstatix")         # Base R statistics
+base::library(package = "knitr")           # Build this presentation
+base::library(package = "dplyr")           # Handle big tables
+base::library(package = "Seurat")          # Handle SingleCell analyses
+base::library(package = "SeuratObject")    # Handle SingleCell objects
+base::library(package = "SingleCellExperiment") # Handle SingleCell file formats
+base::library(package = "UpSetR")          # Nice venn-like graphs
+base::library(package = "EnhancedVolcano") # Draw Volcano plot
+

First, we load Seurat object:

+
sobj <- base::readRDS(
+  # Path to the RDS file
+  file = "/shared/projects/ebaii_sc_teachers/SC_TD/06_Integration/RESULTS/12_TD3A.TDCT_S5_Integrated_12926.3886.RDS"
+)
+

Then we join layers:

+
Seurat::Idents(sobj) <- sobj$orig.ident
+sobj <- SeuratObject::JoinLayers(sobj)
+

Then we perform differential expression analysis:

+
sobj_de <- Seurat::FindMarkers(
+  # Object on which to perform DEA
+  object = sobj,
+  # Name of factor in condition 1
+  ident.1 = "TD3A",
+  # Name of factor in condition 2
+  ident.2 = "TDCT"
+)
+

And that’s all. Our goal is to understand these lines, being able to +write them is a bonus.

+
+
+

1.2 Purpose of this +session

+

Up to now, we have:

+
    +
  1. Identified to which cell each sequenced reads come from
  2. +
  3. Identified to which gene each read come from
  4. +
  5. Identified possible bias in gene expression for each cell
  6. +
  7. Filtered and corrected these bias as well as we can
  8. +
+

We would like to identify the list of genes that characterize +differences between cell cycle phases G1 and S groups.

+

At the end of this session you will know:

+
    +
  1. how to select a differential analysis method
  2. +
  3. how to select the correct normalization (if any?) that must be +provided to your differential analysis method
  4. +
  5. How to read differential expression results
  6. +
+
+
+

1.3 Load RDS dataset

+

You already have a dataset loaded ? Good. Keep on going with it ! You +don’t have one ? Use mine:

+
# The function `readRDS` from package `base`.
+sobj <- base::readRDS(
+  # Path to the RDS file
+  file = "/shared/projects/ebaii_sc_teachers/SC_TD/06_Integration/RESULTS/12_TD3A.TDCT_S5_Integrated_12926.3886.RDS"
+)
+

The command above, uses the function readRDS +from base R package. and git it the path to the RDS +file we built in previous session.

+
+
+

1.4

+
+
+

1.5 Insight

+

We are wondering what’s in our dataset. Let’s have a look, with the +function print +form the package base.

+
base::print(
+  # Object to display
+  x = sobj
+)
+
An object of class Seurat 
+12926 features across 3886 samples within 1 assay 
+Active assay: RNA (12926 features, 2000 variable features)
+ 5 layers present: counts.TD3A, counts.TDCT, data.TD3A, data.TDCT, scale.data
+ 6 dimensional reductions calculated: pca, CCAIntegration, umap, RPCAIntegration, HarmonyIntegration, HarmonyStandalone
+

We have 12508 features (aka genes), across 2018 samples +(aka cells) in the TD3A condition. We have 12254 features +(aka genes), across 1868 samples (aka cells) in the +TD3A condition.

+

Let us have a look at the RNA counts for 10 cells and their +annotation, with the function head +from the package utils.

+
utils::head(
+  # Object to visualize
+  x = sobj,
+  # Number of lines to display
+  n = 10
+)
+
+ +

There is no counts, normalized or not. Where are they ?

+

In order to explore the content of sobj, use the +function str from the package utils:

+
utils::str(
+  # Object to explore
+  object = sobj@assays
+)
+

Alternatively, in RStudio, you can click on the object pane and +explore manually the content of the object. If we explore the slot +assays, then we find the counts.

+

You can access them with:

+
utils::head(
+  # Object to visualize
+  x = SeuratObject::GetAssayData(object = sobj, assay = "RNA", layer = "data.TD3A"),
+  # Number of rows to display
+  n = 10
+)
+
+ +

We have one gene per line, one cell per column, and RNA counts in +each row.

+

For the sake of this session, we won’t compare the whole dataset. It +would take up to 15 minutes to complete. During the rest of this +session, we will compare clusters 8 and 10, as annotated with +Harmony.

+

We need to re-annotate layers to do so. This is done in two +steps:

+
    +
  1. redefine Idents +in order to be able to call cells by their cluster names.
  2. +
  3. JoinLayers +to have a single count table with both TD3A and TDCT cells from clusters +8 and 10 together.
  4. +
+
Seurat::Idents(sobj) <- sobj$HarmonyStandalone_clusters
+sobj <- SeuratObject::JoinLayers(sobj)
+

Let’s check if our cells are joint:

+
base::print(sobj)
+
An object of class Seurat 
+12926 features across 3886 samples within 1 assay 
+Active assay: RNA (12926 features, 2000 variable features)
+ 3 layers present: data, counts, scale.data
+ 6 dimensional reductions calculated: pca, CCAIntegration, umap, RPCAIntegration, HarmonyIntegration, HarmonyStandalone
+

We have one gene per line, one cell per column, and RNA counts in +each row. Are these counts normalized ? Are they scaled ? Are they +filtered ? Are they corrected ?

+
+ +Answer + +

These counts are normalized, scaled, filtered. This information is +available in the seurat object itself, within the slot +commands. See an example below:

+
names(sobj@commands)
+
 [1] "NormalizeData.RNA"                   
+ [2] "FindVariableFeatures.RNA"            
+ [3] "ScaleData.RNA"                       
+ [4] "RunPCA.RNA"                          
+ [5] "RunUMAP.RNA.CCAIntegration"          
+ [6] "FindNeighbors.RNA.CCAIntegration"    
+ [7] "RunUMAP.RNA.RPCAIntegration"         
+ [8] "FindNeighbors.RNA.RPCAIntegration"   
+ [9] "RunUMAP.RNA.HarmonyIntegration"      
+[10] "FindNeighbors.RNA.HarmonyIntegration"
+[11] "RunUMAP.RNA.HarmonyStandalone"       
+[12] "FindNeighbors.RNA.HarmonyStandalone" 
+[13] "FindClusters"                        
+

However, please be aware that counts in the slot +count are raw counts. Normalized counts are in the slot +data and scaled data are in the slot +scaled.data. And it you do not find that clear, I totally +agree with you.

+
+


+

Is it normal that we have so many zeros ? And what about surch low +counts, is it because we downsampled the sequenced reads for this +session ?

+
+ +Answer + +

The large number of null counts is completely normal. In maths/stats +we talk about matrix sparcity, i.e a table with lots +of zeros. If the data were to be downsampled, we had had done this by +cropping over a small chromosome, and not reducing the general amount of +reads.

+
+
+
+
+
+

2 Select a DE method

+
+

2.1 Available +methods

+

Seurat let +us use multiple differential analysis methods with its function FindMarkers.

+
    +
  1. wilcox: +The wilcoxon test tests the mean of expression and looks for a +difference in these means.
  2. +
  3. MAST: +This tool has been built for Single Cell. It is based on a statistical +model called “Hurdle Model”, +which excells with data that contains lots of zeros (which is our case +in Single Cell RNA-Seq: most of the genes are not +expressed.)
  4. +
  5. DESeq2: +This tool has originally been built for bulk RNA-Seq but now includes +specific functions for Single Cell. It performs well when counts are +highly variable or when you wand to compare a handful of cells.
  6. +
  7. t-test: +The t-test uses a comparison of means to assess differential expression +probability.
  8. +
  9. roc: +An AUC value of 1 means that expression values for this gene alone can +perfectly classify the two groupings (i.e. Each of the cells in cells.1 +exhibit a higher level than each of the cells in cells.2). An AUC value +of 0 also means there is perfect classification, but in the other +direction. A value of 0.5 implies that the gene has no predictive power +to classify the two groups.
  10. +
+

The main question now is how to choose the right +test: spoilers, there are no option better than another in all +ways.

+

From Soneson & Robinson (2018) Nature Methods:

+
+ +

de_tools

+
+

Here, researchers have designed an artificial dataset where they knew +in advance the list of differentially expressed genes. They have used +all these algorithms and consigned the results.

+
    +
  1. DESeq2, Limma seems to have a higher number of false positives +(genes called differentially expressed while they were not.)
  2. +
  3. Wilcoxon seems to be better in general
  4. +
  5. Mast performs well in absolute
  6. +
+

ANOVA was not present in this study.

+
+
+

2.2 Case: Plac8

+

The question is now to guess whether this gene is differnetially +expressed or not.

+
+

2.2.1 Cell +observations

+

Let’s have a look at the gene named ‘Plac8’, +involved in positive regulation of cold-induced thermogenesis and +positive regulation of transcription by RNA polymerase II. In order to +plot its expression across all cells, we are going to use the function +VlnPlot +from Seurat package. The input object is obviously +contained in the sobj variable, since it is our only Seurat +object. In addition, we are going to select the feature +Plac8, and split the graph according to the clusters we +annotated earlier.

+
Seurat::VlnPlot(
+  # A subset of the Seurat object
+  # limited to clusters 8 and 10, 
+  # or else we will plot all the clusters
+  object = subset(sobj, HarmonyStandalone_clusters %in% c("8", "10")),
+  # The name of the gene of interest (feature = gene)
+  features = "Plac8",
+  # The name of the Seurat cell annotation
+  split.by = "HarmonyStandalone_clusters",
+  # Change color for presentation
+  cols = c("darkslategray3", "olivedrab")
+)
+

+

Using your ‘intuition’, is this gene differentially +expressed between cluster 8 and cluster 10 ?

+
+ +Answer + +

In cluster 10, the violin plot highlights almost no cells with low or +zero Plac8 expression. The highest density of cells has a +Plac8 normalized expression aroung 1.5.

+

In Cluster 8, cells seems to have no expression of Plac8 +at all.

+

IMHO, and you can disagree, the expression of the gene +Plac8 differs between cluster 8 and cluster 10. This is +purely intuitive.

+
+

Using your ‘intuition’, is this gene differentially +expressed between G1 and S phases ?

+
+
+ +Answer + +
Seurat::VlnPlot(
+  # The Seurat object
+  object = sobj,
+  # The name of the gene of interest (feature = gene)
+  features = "Plac8",
+  # The name of the Seurat cell-cycle annotation
+  group.by = "CC_Seurat_Phase",
+  # Change color for presentation
+  cols = c("darkslategray3", "olivedrab", "orangered3")
+)
+

+

Most of the expression is null, some cells express the gene.

+
+
+

Okay, let’s have some informations about these distributions.

+
# Store counts in a variable
+counts <- base::as.data.frame(
+  # The matrix to reformat into a dataframe
+  x = SeuratObject::GetAssayData(object = sobj, assay = "RNA", layer = "data")
+)
+# Rename cells with cell harmony cluster
+base::colnames(counts) <- paste(
+  # The names of the cell cluster for each cell
+  sobj$CC_Seurat_Phase,
+  # The names of the cells themselves
+  colnames(sobj),
+  sep = "_"
+)
+
+ +
+

We have 2838 cells within the G1 group:

+
countsG1 <- select(counts, matches("^G1."))
+
+Plac8G1 <- base::as.data.frame(base::t(countsG1["Plac8", ]))
+
+base::summary(Plac8G1)
+
     Plac8        
+ Min.   :0.00000  
+ 1st Qu.:0.00000  
+ Median :0.00000  
+ Mean   :0.04823  
+ 3rd Qu.:0.00000  
+ Max.   :3.48145  
+
+
+

We have 711 cells withing the S group:

+
countsS <- select(counts, matches("^S."))
+
+Plac8S <- base::as.data.frame(base::t(countsS["Plac8", ]))
+
+base::summary(Plac8S)
+
     Plac8       
+ Min.   :0.0000  
+ 1st Qu.:0.0000  
+ Median :0.0000  
+ Mean   :0.2097  
+ 3rd Qu.:0.0000  
+ Max.   :3.5292  
+
+
+
+

2.2.2 From biology to +statistics

+

Okay, let us resort on statistics to evaluate our chances to be guess +correctly, or our risks to guess wrong.

+

We have lots of observations: 2838 cells within the G1 phase, and 711 +cells withing the S phase.Statisticians really like to have a lot of +observations! Ideally, statisticians always want to have more +observation than tests. We have a total of 3549 observations and we are +testing 1 gene. For them, this is a dream come true!

+

Are our cells supposed to be interacting with each others ? Are they +independent from each others ? This is very important, and usually, it +requires a discussion.

+

oes the expression in our cells follow a normal distribution ? It’s +easy to check. Let’s draw the expression like we did above.

+

First, we use the function rbind +from base package. Be careful, the function +rbind also exists in DelayedArray, +data.table, and BiocGenerics packages. We want +to use the basic one. Here is an example of odd behaviors that occurs +when not using the package name before a function call.

+
# Add column idientifiers in each count dataframes
+Plac8G1$phase <- "G1"
+Plac8S$phase <- "S"
+# Paste the rows beneith each other
+Plac8 <- base::rbind(
+  #  variable pointing to G1 counts
+  Plac8G1,
+  #  variable pointing to S counts
+  Plac8S,
+  # A Boolean, requesting that strings/characters
+  # should not be casted as `logical`. It breaks graphs.
+  stringsAsFactors = FALSE
+)
+

Secondly, we use the function gghistogram +from package ggpubr in order to display relative abundance +of gene expression:

+
ggpubr::gghistogram(
+  Plac8,
+  x = "Plac8",
+  y = '..density..',
+  fill = "steelblue",
+  bins = 15,
+  add_density = TRUE
+)
+

+

Our distribution doesn’t seems to be normal, nor binomial we will +have to rely on non-parametric tests.

+

Let’s run a non-parametric test base on the mean of distributions, +since it’s the clothest to our ‘intuitive’ approach. Let there be Wilcoxon +test.

+

In R, it’s quite straightforward: we have the function wilcoxon_test +to perform the test, then we can plot the result.

+
# On the expression table stored in the varialbe `Plac8`,
+# first apply the function `wilcox_test` from package `rstatix`,
+# then we apply the function `add_significance` from package `rstatix`
+stat.test <- Plac8 %>% rstatix::wilcox_test(Plac8 ~ phase) %>% rstatix::add_significance()
+# While doing so, we usually also compute effect size
+# eff.size <- Plac8 %>% rstatix::wilcox_effsize(Plac8 ~ phase)
+
+ +

Wilcoxon test says: the distributions are different, with a 3^{-13} % +of risks of being wrong. The gene Plac8 can safely be said +differentially expressed. We can compute a fold change and conclude.

+

Just out of curiosity, be aware that t_test +from rstatix package, gives the same answer. However, DESeq +gives a p-value of 0.05 and an adjusted p-value equal to 1.

+

Depending on the test used, this gene is, or is not differentially +expressed.

+
+
+
+

2.3 Conclusion

+

In conclusion, to select your method, use the following:

+
    +
  1. If you have already done a study with one of these methods, keep +using the same. This is crutial if you ever want to compare your new +result with the old ones.
  2. +
  3. If you want to compare your study with a published one, use the same +method.
  4. +
  5. If you have no idea, use Wilcoxon.
  6. +
  7. If you have bulk data analyzed with DESeq2/Limma, use DESeq2/Limma. +It will be easier to take both works in consideration.
  8. +
+

Please, never use a simple Wilcoxon on bulk RNA-Seq +data.

+
+
+
+

3 Select a dataset

+
+

3.1 Dataset depends on +selected method

+

There it is quite easier:

+
+ +
+
+

3.2 FindMarkers

+

With the function FindMarkers +from package Seurat, we want to make three groups:

+
    +
  1. One using wilcoxon to perform DEA between “G1” and “S” +phases.
  2. +
  3. One using t-test to perform DEA between “G1” and “S” +phases.
  4. +
  5. One using ROC to perform DEA between “G1” and “S” +phases.
  6. +
+

We will observe the results and compare our gene lists.

+

Hey, why are you looking at me? It’s your turn to work! Use the all +the notions seen above to select the right counts (slot), +the right input object, and the right arguments.

+

10 minutes practice !

+
+ +Answers + +

Here are the code for each team:

+
sobj_wilcox <- Seurat::FindMarkers(
+  # The variable that contains Seurat Object
+  object = sobj,
+  # Name of condition 1
+  ident.1 = "8",
+  # Name of condition 2
+  ident.2 = "10",
+  # Factor name in the Seurat Object
+  group.by = "HarmonyStandalone_clusters",
+  # Differential analysis method
+  test.use = "wilcox"
+)
+
+sobj_t <- Seurat::FindMarkers(
+  # The variable that contains Seurat Object
+  object = sobj,
+  # Name of condition 1
+  ident.1 = "8",
+  # Name of condition 2
+  ident.2 = "10",
+  # Factor name in the Seurat Object
+  group.by = "HarmonyStandalone_clusters",
+  # Differential analysis method
+  test.use = "t"
+)
+
+sobj_roc <- Seurat::FindMarkers(
+  # The variable that contains Seurat Object
+  object = sobj,
+  # Name of condition 1
+  ident.1 = "8",
+  # Name of condition 2
+  ident.2 = "10",
+  # Factor name in the Seurat Object
+  group.by = "HarmonyStandalone_clusters",
+  # Differential analysis method
+  test.use = "roc"
+)
+
+


+

In the function argument, there is a FoldChange threshold. Should we +filter gene expression based on FoldChange? In case of positive answer, +how much should that threshold be?

+
+ +Answer + +

About thresholds on FDR (False Discovery Rate) and Log2(FC) (Log of +the Fold Change), there are many discussions.

+

Remember, here the threshold on Fold Change is Logged. A +log2(1) =0. And keep in mind the following:

+
    +
  1. If one selects a fold change threshold above 1.7, then their study +concludes that smoking is not related to lung cancer.
  2. +
  3. If one selects a fold change threshold above 1, then their study +concludes that a fast-food based diet does not lead to weight gain.
  4. +
  5. If one selects a fold change threshold above 1/25 000 000, then +their study concludes: it is a complete hazard that mice have featal +malformation when in presence of Bisphanol.
  6. +
+

In conclusion, there one, and only one reason to filter on fold +change: in my experience, a fold change below 0.7 will be hard to +see/verify on wet-lab (qRT).

+

If you need to reduce a too large number of differentially expressed +genes, then reduce the FDR to 0.01, or even better, to 0.001. With that, +you reduce your number of false claims.

+
+


+

Can you help me with DEseq2?

+

When I run the following command line, I have an error :

+
sobj_deseq2 <- Seurat::FindMarkers(
+  # The variable that contains Seurat Object
+  object = sobj,
+  # Name of condition 1
+  ident.1 = 8,
+  # Name of condition 2
+  ident.2 = 10,
+  # Factor name in the Seurat Object
+  group.by = "HarmonyStandalone_clusters",
+  # Differential analysis method
+  test.use = "deseq2",
+  # Use non-normalized data with DESeq2
+  slot = "counts"
+)
+
+

Error in PerformDE(object = object, cells.1 = cells.1, cells.2 = +cells.2, : Unknown test: deseq2

+
+
+ +Answer + +

Oh! My fault, it was a typo in my command! Thank you all for your +help!

+
sobj_deseq2 <- Seurat::FindMarkers(
+  # The variable that contains Seurat Object
+  object = sobj,
+  # Name of condition 1
+  ident.1 = "8",
+  # Name of condition 2
+  ident.2 = "10",
+  # Factor name in the Seurat Object
+  group.by = "HarmonyStandalone_clusters",
+  # Differential analysis method
+  test.use = "DESeq2",
+  # Use non-normalized data with DESeq2
+  slot = "counts"
+)
+

Remark: by doing surch modification, some fold changes have been +modified: remember the gene Atad2 with a mean expression of 0.08 in G1, +and 0.2 in S phases? Mean expressions are now around 1.08 for G1, and +1.2 for S phases. This may be the reason why it was not differentially +expressed in DESeq2, while Wilcoxon and T-test returned adjusted pvalues +far below 0.05.

+
+
+
+
+

4 Explore results

+
+

4.1 Big tables

+

Let us have a look at the results:

+
+ +

We have the following columns:

+
    +
  1. p_val: Ignore this column. Always ignore raw p-values. +Look at corrected ones, and if they are missing, then compute them.
  2. +
  3. avg_log2FC: Average Log2(FoldChange). Illustrates how +much a gene is differentially expessed between samples in each +condition.
  4. +
  5. pct.1: Percent of cells with gene expression in +condition one, here in “G1” phase.
  6. +
  7. pct.2: Percent of cells with gene expression in +condition two, here in “S” phase.
  8. +
  9. p_val_adj: The confidence we have in the result. The +closer to 0, the lesser is the risk of error.
  10. +
+
+ +

We have the following columns:

+
    +
  1. p_val: Ignore this column. Always ignore raw p-values. +Look at corrected ones, and if they are missing, then compute them.
  2. +
  3. avg_log2FC: Average Log2(FoldChange). Illustrates how +much a gene is differentially expessed between samples in each +condition.
  4. +
  5. pct.1: Percent of cells with gene expression in +condition one, here in “G1” phase.
  6. +
  7. pct.2: Percent of cells with gene expression in +condition two, here in “S” phase.
  8. +
  9. p_val_adj: The confidence we have in the result. The +closer to 0, the lesser is the risk of error.
  10. +
+
+ +

We have the following columns:

+
    +
  1. myAUC: The area under the curve
  2. +
  3. avg_diff: Average difference in AUC
  4. +
  5. power: abs(AUC-0.5) * 2, usefull to sort +genes based on AUC
  6. +
  7. pct.1: Percent of cells with gene expression in +condition one, here in “G1” phase.
  8. +
  9. pct.2: Percent of cells with gene expression in +condition two, here in “S” phase.
  10. +
+
+ +

We have the following columns:

+
    +
  1. p_val: Ignore this column. Always ignore raw p-values. +Look at corrected ones, and if they are missing, then compute them.
  2. +
  3. avg_log2FC: Average Log2(FoldChange). Illustrates how +much a gene is differentially expessed between samples in each +condition.
  4. +
  5. pct.1: Percent of cells with gene expression in +condition one, here in “G1” phase.
  6. +
  7. pct.2: Percent of cells with gene expression in +condition two, here in “S” phase.
  8. +
  9. p_val_adj: The confidence we have in the result. The +closer to 0, the lesser is the risk of error.
  10. +
+
+
+

4.2 Filter DEA +results

+

What kind of threshold should be used to filter each results?

+
+

If we must label certain scores as good or bad, we can reference the +following rule of thumb:

+

0.5 = No discrimination 0.5-0.7 = Poor discrimination 0.7-0.8 = +Acceptable discrimination 0.8-0.9= Excellent discrimination 0.9 = +Outstanding discrimination

+
+

Hosmer and Lemeshow in Applied Logistic Regression (p. 177)

+
+
+

4.3 Add results to Seurat +objects

+

We’d like to store the results of differential expression analysis in +the Seurat object.

+
sobj@misc$wilcox <- sobj_wilcox
+
+
+

4.4 Common results

+

Now we can plot intersections in an up-set graph. It is like a venn +diagramm:

+
UpSetR::upset(
+  data = UpSetR::fromList(data),
+  order.by = "freq"
+)
+

+
+
+

4.5 Heatmap

+

We’d like to display the expression of genes identified by +FindMarkers. Then we use the function DoHeatmap +from the package Seurat.

+

In order to limit the graph to differentially expressed reads, we use +the function rownames +from R base package on the DEA result table. In this +example, I use the results of wilcoxon, but you shall use any of the +results you previously obtained.

+
Seurat::DoHeatmap(
+  # variable pointing to seurat object
+  object = sobj,
+  # name of DE genes
+  features = base::rownames(sobj_wilcox),
+  # Cluster annotation
+  group.by = "HarmonyStandalone_clusters",
+)
+

+
+
+

4.6 Volcano plot

+

A Volcano plot is usefull to identify differnetial expression +analysis bias.

+

The package EnhancedVolcano has an eponym +function for that:

+
EnhancedVolcano::EnhancedVolcano(
+  #  variable pointing to the DEA results
+  toptable = sobj_wilcox,
+  # Gene names
+  lab = rownames(sobj_wilcox),
+  # Column in which to find Fold Change
+  x = "avg_log2FC",
+  # Column in which to find confidence interval
+  y = "p_val_adj",
+  # Lower fold-change cut-off
+  FCcutoff = 0.2
+)
+

+
+
+

4.7 Session Info

+

This list of all packages used while you work should be included in +each en every R presentation:

+
utils::sessionInfo()
+
R version 4.4.1 (2024-06-14)
+Platform: x86_64-conda-linux-gnu
+Running under: Ubuntu 20.04.6 LTS
+
+Matrix products: default
+BLAS/LAPACK: /shared/ifbstor1/software/miniconda/envs/r-4.4.1/lib/libopenblasp-r0.3.27.so;  LAPACK version 3.12.0
+
+locale:
+ [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
+ [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
+ [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
+ [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
+ [9] LC_ADDRESS=C               LC_TELEPHONE=C            
+[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
+
+time zone: Europe/Paris
+tzcode source: system (glibc)
+
+attached base packages:
+[1] stats4    stats     graphics  grDevices utils     datasets  methods  
+[8] base     
+
+other attached packages:
+ [1] EnhancedVolcano_1.22.0      ggrepel_0.9.6              
+ [3] UpSetR_1.4.0                SingleCellExperiment_1.26.0
+ [5] SummarizedExperiment_1.34.0 Biobase_2.64.0             
+ [7] GenomicRanges_1.56.2        GenomeInfoDb_1.40.1        
+ [9] IRanges_2.38.1              S4Vectors_0.42.1           
+[11] BiocGenerics_0.50.0         MatrixGenerics_1.16.0      
+[13] matrixStats_1.4.1           Seurat_5.1.0               
+[15] SeuratObject_5.0.2          sp_2.1-4                   
+[17] dplyr_1.1.4                 knitr_1.48                 
+[19] rstatix_0.7.2               ggpubr_0.6.0               
+[21] ggplot2_3.5.1               DT_0.33                    
+[23] BiocParallel_1.38.0        
+
+loaded via a namespace (and not attached):
+  [1] RcppAnnoy_0.0.22        splines_4.4.1           later_1.3.2            
+  [4] tibble_3.2.1            polyclip_1.10-7         rpart_4.1.23           
+  [7] fastDummies_1.7.4       lifecycle_1.0.4         globals_0.16.3         
+ [10] lattice_0.22-6          MASS_7.3-61             crosstalk_1.2.1        
+ [13] backports_1.5.0         magrittr_2.0.3          limma_3.60.6           
+ [16] Hmisc_5.1-3             plotly_4.10.4           sass_0.4.9             
+ [19] rmarkdown_2.28          jquerylib_0.1.4         yaml_2.3.10            
+ [22] httpuv_1.6.15           sctransform_0.4.1       spam_2.11-0            
+ [25] spatstat.sparse_3.1-0   reticulate_1.39.0       cowplot_1.1.3          
+ [28] pbapply_1.7-2           RColorBrewer_1.1-3      abind_1.4-8            
+ [31] zlibbioc_1.50.0         Rtsne_0.17              purrr_1.0.2            
+ [34] nnet_7.3-19             GenomeInfoDbData_1.2.12 irlba_2.3.5.1          
+ [37] listenv_0.9.1           spatstat.utils_3.1-0    goftest_1.2-3          
+ [40] RSpectra_0.16-2         spatstat.random_3.3-2   fitdistrplus_1.2-1     
+ [43] parallelly_1.38.0       DelayedArray_0.30.1     leiden_0.4.3.1         
+ [46] codetools_0.2-20        tidyselect_1.2.1        UCSC.utils_1.0.0       
+ [49] farver_2.1.2            base64enc_0.1-3         spatstat.explore_3.3-2 
+ [52] jsonlite_1.8.9          progressr_0.14.0        Formula_1.2-5          
+ [55] ggridges_0.5.6          survival_3.7-0          tools_4.4.1            
+ [58] ica_1.0-3               Rcpp_1.0.13             glue_1.8.0             
+ [61] SparseArray_1.4.8       gridExtra_2.3           DESeq2_1.44.0          
+ [64] xfun_0.48               withr_3.0.1             fastmap_1.2.0          
+ [67] fansi_1.0.6             digest_0.6.37           R6_2.5.1               
+ [70] mime_0.12               colorspace_2.1-1        scattermore_1.2        
+ [73] tensor_1.5              spatstat.data_3.1-2     utf8_1.2.4             
+ [76] tidyr_1.3.1             generics_0.1.3          data.table_1.16.2      
+ [79] S4Arrays_1.4.1          httr_1.4.7              htmlwidgets_1.6.4      
+ [82] uwot_0.2.2              pkgconfig_2.0.3         gtable_0.3.5           
+ [85] lmtest_0.9-40           XVector_0.44.0          htmltools_0.5.8.1      
+ [88] carData_3.0-5           dotCall64_1.2           scales_1.3.0           
+ [91] png_0.1-8               spatstat.univar_3.0-1   rstudioapi_0.17.0      
+ [94] reshape2_1.4.4          checkmate_2.3.1         nlme_3.1-165           
+ [97] cachem_1.1.0            zoo_1.8-12              stringr_1.5.1          
+[100] KernSmooth_2.23-24      vipor_0.4.7             parallel_4.4.1         
+[103] miniUI_0.1.1.1          foreign_0.8-86          ggrastr_1.0.2          
+[106] pillar_1.9.0            grid_4.4.1              vctrs_0.6.5            
+[109] RANN_2.6.2              promises_1.3.0          car_3.1-3              
+[112] xtable_1.8-4            cluster_2.1.6           beeswarm_0.4.0         
+[115] htmlTable_2.4.2         evaluate_1.0.1          locfit_1.5-9.9         
+[118] cli_3.6.3               compiler_4.4.1          crayon_1.5.3           
+[121] rlang_1.1.4             future.apply_1.11.2     ggsignif_0.6.4         
+[124] labeling_0.4.3          ggbeeswarm_0.7.2        plyr_1.8.9             
+[127] stringi_1.8.4           viridisLite_0.4.2       deldir_2.0-4           
+[130] munsell_0.5.1           lazyeval_0.2.2          spatstat.geom_3.3-3    
+[133] Matrix_1.7-1            RcppHNSW_0.6.0          patchwork_1.3.0        
+[136] future_1.34.0           statmod_1.5.0           shiny_1.9.1            
+[139] highr_0.11              ROCR_1.0-11             igraph_2.1.1           
+[142] broom_1.0.7             bslib_0.8.0            
+
+
+ +
---
title: "<CENTER>EBAII n1 2024<BR /><B>Differential expression analysis</B></CENTER>"
date: "`r Sys.Date()`"
author:
  - name: "Thibault DAYRIS"
    email: "thibault.dayris@gustaveroussy.fr"
  - name: "Bastien JOB"
    email: "bastien.job@gustaveroussy.fr"
output:
  html_document:  # Defautl view
    highlight: tango  ## Theme for the code chunks
    number_sections: true  ## Adds number to headers (sections)
    theme: flatly  ## CSS theme for the HTML page
    toc: true  ## Adds a table of content
    toc_float:  ## TOC options
      collapsed: true  ## By default, the TOC is folded
      smooth_scroll: true ## Smooth scroll of the HTML page
    self_contained: true ## Includes all plots/images within the HTML
    code_download: true ## Adds a button to download the Rmd
    code_folding: show
    thumbnails: false
    lightbox: true
    fig_caption: true
    gallery: true
    use_bookdown: true
always_allow_html: true ## Allow plain HTML code in the Rmd
editor_options: 
  markdown: 
    wrap: 88
---


<!-- Add the Roscoff banner -->

```{css banner, echo = FALSE}
body {
  background-image: url('ebaii_banner.png');
  background-repeat: no-repeat;
  background-size: 100%;
  margin: 10%
}

div {
  background-color: rgba(255, 255, 255, 0.35)   /* 35% opaque white */;
  padding: 0.25em;
}
```

<!-- Allows to hide the TOC by default, display it with a button, move it to the right or left of the page -->

```{r setup, include=FALSE, echo=FALSE}
knitr::opts_chunk$set(
  echo = TRUE,        # Print the code
  eval = TRUE,       # Do not run command lines
  message = FALSE,    # Print messages
  prompt = FALSE,     # Do not display prompt
  comment = NA,       # No comments on this section
  warning = FALSE,     # Display warnings
  tidy = FALSE,
  fig.align = "center",
  width = 100       # Number of characters per line
)
hooks = knitr::knit_hooks$get()
hook_foldable = function(type) {
  force(type)
  function(x, options) {
    res = hooks[[type]](x, options)

    if (isFALSE(options[[paste0("fold.", type)]])) return(res)

    paste0(
      "<details><summary>Show ", type, "</summary>\n\n",
      res,
      "\n\n</details>"
    )
  }
}
Hmisc::hidingTOC(
  buttonLabel = 'Show TOC', 
  hidden = TRUE, 
  tocSide = 'left', 
  buttonSide='left', 
  posCollapse = 'margin', 
  levels = 3
)
my_seed <- 1337L
```

<!-- CSS to color chunks and outputs -->

```{css chunks, echo = FALSE}
div.beyond pre { background-color: pink; color : black; }
div.beyond pre.r { background-color: lightblue; border: 3px solid blue; }
div.notrun pre { background-color: lightyellow; color : brown; }
div.notrun pre.r { background-color: lightgrey; border: 3px solid black; }
```

<style type="text/css">
details:hover { 
  cursor: pointer 
}
body {
  text-align: justify
}
.column-left{
  float: left;
  width: 47%;
  text-align: left;
}
.column-right{
  float: right;
  width: 47%;
  text-align: left;
}
</style>


# Forewords

I need three teams for this session: 
team [`Wilcoxon`](https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test), 
team [`Student`](https://en.wikipedia.org/wiki/Student%27s_t-test), 
and team [`ROC`](https://en.wikipedia.org/wiki/Receiver_operating_characteristic).

I will also need your help because I can't make [DESeq2](https://bioconductor.org/packages/release/bioc/html/DESeq2.html) 
work correctly. But I'm sure, that we will solve my issue: you're in the best 
session here at [EBAII](https://github.com/IFB-ElixirFr/EBAII).

## TLDR: R command lines

In this presentation, there will be screen captures for you to follow the 
lesson. There will also be every single R command lines. 
Do not take care of the command lines if you find them too challenging. 
Our goal here, is to understand the main mechanism of Differential 
Expression Analysis. R is just a tool.

Below are the libraries we need to perform this whole session:

```{r load_libraries, eval=TRUE, echo=TRUE}
base::library(package = "BiocParallel")    # Optionally multithread some steps
base::library(package = "DT")              # Display nice table in HTML
base::library(package = "ggplot2")         # Draw graphs and plots
base::library(package = "ggpubr")          # Draw nicer graphs
base::library(package = "rstatix")         # Base R statistics
base::library(package = "knitr")           # Build this presentation
base::library(package = "dplyr")           # Handle big tables
base::library(package = "Seurat")          # Handle SingleCell analyses
base::library(package = "SeuratObject")    # Handle SingleCell objects
base::library(package = "SingleCellExperiment") # Handle SingleCell file formats
base::library(package = "UpSetR")          # Nice venn-like graphs
base::library(package = "EnhancedVolcano") # Draw Volcano plot
```

First, we load Seurat object:

```{r load_rds, eval=FALSE, echo=TRUE}
sobj <- base::readRDS(
  # Path to the RDS file
  file = "/shared/projects/ebaii_sc_teachers/SC_TD/06_Integration/RESULTS/12_TD3A.TDCT_S5_Integrated_12926.3886.RDS"
)
```

Then we join layers:

```{r join_layers_tldr, eval=FALSE, echo=TRUE}
Seurat::Idents(sobj) <- sobj$orig.ident
sobj <- SeuratObject::JoinLayers(sobj)
```

Then we perform differential expression analysis:

```{r run_dea, eval=FALSE, echo=TRUE}

sobj_de <- Seurat::FindMarkers(
  # Object on which to perform DEA
  object = sobj,
  # Name of factor in condition 1
  ident.1 = "TD3A",
  # Name of factor in condition 2
  ident.2 = "TDCT"
)
```

And that's all. Our goal is to understand these lines,
being able to write them is a bonus.

## Purpose of this session

Up to now, we have:

1. Identified to which cell each sequenced reads come from
1. Identified to which gene each read come from
1. Identified possible bias in gene expression for each cell
1. Filtered and corrected these bias as well as we can

We would like to identify the list of genes that characterize differences 
between cell cycle phases G1 and S groups.

At the end of this session you will know:

1. how to select a differential analysis method
1. how to select the correct normalization (if any?) that must be provided to your differential analysis method
1. How to read differential expression results


## Load RDS dataset

You already have a dataset loaded ? Good. Keep on going with it ! You don't have one ? Use mine:

```{r load_rds_exec, eval=TRUE, echo=TRUE}
# The function `readRDS` from package `base`.
sobj <- base::readRDS(
  # Path to the RDS file
  file = "/shared/projects/ebaii_sc_teachers/SC_TD/06_Integration/RESULTS/12_TD3A.TDCT_S5_Integrated_12926.3886.RDS"
)
```

The command above, uses the function [`readRDS`](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/readRDS) 
from `base` R package. and git it the path to the [RDS](https://riptutorial.com/r/example/3650/rds-and-rdata--rda--files) 
file we built in previous session.

## 

## Insight

We are wondering what's in our dataset. Let's have a look, 
with the function [`print`](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/print) 
form the package `base`.

```{r print_seurat, eval=TRUE, echo=TRUE}
base::print(
  # Object to display
  x = sobj
)
```

We have `r base::dim(SeuratObject::GetAssayData(object = sobj, assay = "RNA", layer = "data.TD3A"))[1]` features (_aka_ genes), 
across `r base::dim(SeuratObject::GetAssayData(object = sobj, assay = "RNA", layer = "data.TD3A"))[2]` samples (_aka_ cells) in the TD3A condition. We have `r base::dim(SeuratObject::GetAssayData(object = sobj, assay = "RNA", layer = "data.TDCT"))[1]` features (_aka_ genes), 
across `r base::dim(SeuratObject::GetAssayData(object = sobj, assay = "RNA", layer = "data.TDCT"))[2]` samples (_aka_ cells) in the TD3A condition.


Let us have a look at the RNA counts for 10 cells and their annotation,
with the function [`head`](https://www.rdocumentation.org/packages/utils/versions/3.6.2/topics/head) from the package `utils`.

```{r head_seurat_counts_phase, eval=FALSE, echo=TRUE}
utils::head(
  # Object to visualize
  x = sobj,
  # Number of lines to display
  n = 10
)
```


```{r head_seurat_counts_phase_dt, eval=TRUE, echo=FALSE}
tmp <- utils::head(x = sobj, n = 10)
DT::datatable(data = tmp)
```

There is no counts, normalized or not. Where are they ?

In order to explore the content of `sobj`, use the function `str` from the package `utils`:

```{r str_seurat_object, eval=FALSE, echo=TRUE}
utils::str(
  # Object to explore
  object = sobj@assays
)
```

Alternatively, in RStudio, you can click on the object pane and explore manually 
the content of the object. If we explore the slot `assays`, then we find the counts.

You can access them with:

```{r head_seurat_count_table, eval=FALSE, echo=TRUE}
utils::head(
  # Object to visualize
  x = SeuratObject::GetAssayData(object = sobj, assay = "RNA", layer = "data.TD3A"),
  # Number of rows to display
  n = 10
)
```

```{r head_seurat_count_table_dt, eval=TRUE, echo=FALSE}
tmp <- utils::head(
  x = SeuratObject::GetAssayData(object = sobj, assay = "RNA", layer = "data.TD3A"), 
  10
)
DT::datatable(
  data = base::as.data.frame(tmp)[, 1:5]
)
```

We have one gene per line, one cell per column, and RNA counts in each row.


For the sake of this session, we won't compare the whole dataset. It would take
up to 15 minutes to complete. During the rest of this session, we will compare
clusters 8 and 10, as annotated with Harmony.

We need to re-annotate layers to do so. This is done in two steps: 

1. redefine [`Idents`](https://www.rdocumentation.org/packages/Seurat/versions/3.1.4/topics/Idents)
in order to be able to call cells by their cluster names.
1. [`JoinLayers`](https://www.rdocumentation.org/packages/SeuratObject/versions/5.0.2/topics/JoinLayers) to have a single count table with both TD3A and TDCT cells from clusters 8 and 10 together.

```{r ident_and_join_layers, eval=TRUE, echo=TRUE}
Seurat::Idents(sobj) <- sobj$HarmonyStandalone_clusters
sobj <- SeuratObject::JoinLayers(sobj)
```

Let's check if our cells are joint:

```{r check_joint_layers}
base::print(sobj)
```


We have one gene per line, one cell per column, and RNA counts in each row.
Are these counts normalized ? Are they scaled ? Are they filtered ? Are 
they corrected ?

<details>

<summary>Answer</summary>

These counts are normalized, scaled, filtered. This information is 
available in the seurat object itself, within the slot `commands`. See an
example below:

```{r seurat_history, eval=TRUE, echo=TRUE}
names(sobj@commands)
```

**However**, please be aware that counts in the slot `count` are raw counts.
Normalized counts are in the slot `data` and scaled data are in the slot 
`scaled.data`. And it you do not find that clear, I totally agree with you.

</details>
<br />


Is it normal that we have so many zeros ? And what about surch low counts,
is it because we downsampled the sequenced reads for this session ?

<details>

<summary>Answer</summary>

The large number of null counts is completely normal. In maths/stats
we talk about _matrix sparcity_, _i.e_ a table with lots of zeros. If the
data were to be downsampled, we had had done this by cropping over a small
chromosome, and not reducing the general amount of reads.

<br />
</details>

# Select a DE method

## Available methods

[Seurat](https://satijalab.org/seurat/articles/de_vignette) let us use multiple 
differential analysis methods with its function [`FindMarkers`](https://satijalab.org/seurat/reference/findmarkers).

1. [wilcox](https://www.rdocumentation.org/packages/rstatix/versions/0.7.1): The wilcoxon test tests the mean of expression and looks for a difference in these means.
1. [MAST](https://www.bioconductor.org/packages/release/bioc/vignettes/MAST/inst/doc/MAST-Intro.html): This tool has been built for Single Cell. It is based on a statistical model called ["Hurdle Model"](https://en.wikipedia.org/wiki/Hurdle_model), which excells with data that contains lots of zeros (which is our case in Single Cell RNA-Seq: most of the genes are *not* expressed.)
1. [DESeq2](https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#recommendations-for-single-cell-analysis): This tool has originally been built for bulk RNA-Seq but now includes specific functions for Single Cell. It performs well when counts are highly variable or when you wand to compare a handful of cells.
1. [t-test](https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/t.test): The t-test uses a comparison of means to assess differential expression probability.
1. [roc](https://en.wikipedia.org/wiki/Receiver_operating_characteristic): An AUC value of 1 means that expression values for this gene alone can perfectly classify the two groupings (i.e. Each of the cells in cells.1 exhibit a higher level than each of the cells in cells.2). An AUC value of 0 also means there is perfect classification, but in the other direction. A value of 0.5 implies that the gene has no predictive power to classify the two groups.

The main question now is **how to choose the right test**: spoilers, there are no option better than another in all ways.

From Soneson & Robinson (2018) Nature Methods:

![de_tools](images/DE_tools.png)

Here, researchers have designed an artificial dataset where they knew in advance the list of differentially expressed genes. They have used all these algorithms and consigned the results.

1. DESeq2, Limma seems to have a higher number of false positives (genes called differentially expressed while they were not.)
1. Wilcoxon seems to be better in general
1. Mast performs well in absolute

ANOVA was not present in this study.

## Case: Plac8

The question is now to guess whether this gene is differnetially expressed or not.

### Cell observations

Let's have a look at the gene named ['Plac8'](https://www.genecards.org/cgi-bin/carddisp.pl?gene=PLAC8&keywords=plac8), 
involved in positive regulation of cold-induced thermogenesis and positive 
regulation of transcription by RNA polymerase II. In order to plot its 
expression across all cells, we are going to use the function 
[`VlnPlot`](https://satijalab.org/seurat/reference/vlnplot)
from `Seurat` package. The input object is obviously contained in the `sobj`
variable, since it is our only Seurat object. In addition, we are going to
select the feature `Plac8`, and split the graph according to the clusters
we annotated earlier.

```{r seurat_vlnplot_Plac8_demo, eval=TRUE, echo=TRUE}
Seurat::VlnPlot(
  # A subset of the Seurat object
  # limited to clusters 8 and 10, 
  # or else we will plot all the clusters
  object = subset(sobj, HarmonyStandalone_clusters %in% c("8", "10")),
  # The name of the gene of interest (feature = gene)
  features = "Plac8",
  # The name of the Seurat cell annotation
  split.by = "HarmonyStandalone_clusters",
  # Change color for presentation
  cols = c("darkslategray3", "olivedrab")
)
```

Using your _'intuition'_, is this gene differentially expressed between cluster
8 and cluster 10 ?

<details>

<summary>Answer</summary>

In cluster 10, the violin plot highlights almost no cells with low or zero `Plac8`
expression. The highest density of cells has a `Plac8` normalized expression
aroung 1.5.

In Cluster 8, cells seems to have no expression of `Plac8` at all.

IMHO, and you can disagree, the expression of the gene `Plac8` differs between
cluster 8 and cluster 10. This is purely intuitive.

</details>

Using your _'intuition'_, is this gene differentially expressed between G1 and S phases ?


<br />
<details>

<summary>Answer</summary>

```{r vlnplot_seurat_group_phase_code, echo=TRUE, eval=TRUE}
Seurat::VlnPlot(
  # The Seurat object
  object = sobj,
  # The name of the gene of interest (feature = gene)
  features = "Plac8",
  # The name of the Seurat cell-cycle annotation
  group.by = "CC_Seurat_Phase",
  # Change color for presentation
  cols = c("darkslategray3", "olivedrab", "orangered3")
)
```

Most of the expression is null, some cells express the gene.

<br />
</details>

Okay, let's have some informations about these distributions.

```{r general_count_table, eval=TRUE, echo=TRUE}
# Store counts in a variable
counts <- base::as.data.frame(
  # The matrix to reformat into a dataframe
  x = SeuratObject::GetAssayData(object = sobj, assay = "RNA", layer = "data")
)
# Rename cells with cell harmony cluster
base::colnames(counts) <- paste(
  # The names of the cell cluster for each cell
  sobj$CC_Seurat_Phase,
  # The names of the cells themselves
  colnames(sobj),
  sep = "_"
)
```

```{r general_count_table_display, eval=TRUE, echo=FALSE}
DT::datatable(head(counts))
```

<div class="column-left">
We have `r length(colnames(sobj[, sobj$CC_Seurat_Phase == "G1"]))` cells within the G1 group:

```{r Plac8_summaries_G1, eval=TRUE}
countsG1 <- select(counts, matches("^G1."))

Plac8G1 <- base::as.data.frame(base::t(countsG1["Plac8", ]))

base::summary(Plac8G1)
```
</div>
<div class="column-right">
We have `r length(colnames(sobj[, sobj$CC_Seurat_Phase == "S"]))` cells withing the S group:

```{r Plac8_summaries_S, eval=TRUE}
countsS <- select(counts, matches("^S."))

Plac8S <- base::as.data.frame(base::t(countsS["Plac8", ]))

base::summary(Plac8S)
```
</div>

### From biology to statistics

Okay, let us resort on statistics to evaluate our chances to be guess correctly, or our risks to guess wrong.

We have lots of observations: `r base::length(base::colnames(sobj[, sobj$CC_Seurat_Phase == "G1"]))` 
cells within the G1 phase, and `r base::length(base::colnames(sobj[, sobj$CC_Seurat_Phase == "S"]))` 
cells withing the S phase.Statisticians really like to have a lot of 
observations! Ideally, statisticians always want to have more observation 
than tests. We have a total of `r base::length(base::colnames(sobj[, sobj$CC_Seurat_Phase == "G1"])) + base::length(base::colnames(sobj[, sobj$CC_Seurat_Phase == "S"]))` 
observations and we are testing 1 gene. For them, this is a dream come true!

Are our cells supposed to be interacting with each others ? 
Are they independent from each others ? 
This is very important, and usually, it requires a discussion.

oes the expression in our cells follow a normal distribution ? 
It's easy to check. Let's draw the expression like we did above.

First, we use the function [`rbind`](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/cbind) 
from `base` package. Be careful, the function `rbind` also exists 
in `DelayedArray`, `data.table`, and `BiocGenerics` packages. 
We want to use the basic one. Here is an example of odd behaviors 
that occurs when not using the package name before a function call.

```{r distribution_Plac8_table_build, eval=TRUE, echo=TRUE}
# Add column idientifiers in each count dataframes
Plac8G1$phase <- "G1"
Plac8S$phase <- "S"
# Paste the rows beneith each other
Plac8 <- base::rbind(
  #  variable pointing to G1 counts
  Plac8G1,
  #  variable pointing to S counts
  Plac8S,
  # A Boolean, requesting that strings/characters
  # should not be casted as `logical`. It breaks graphs.
  stringsAsFactors = FALSE
)
```

Secondly, we use the function [`gghistogram`](https://www.rdocumentation.org/packages/ggpubr/versions/0.6.0/topics/gghistogram)
from package `ggpubr` in order to display relative abundance of gene expression:

```{r distribution_Plac8_display, eval=TRUE, echo=TRUE}
ggpubr::gghistogram(
  Plac8,
  x = "Plac8",
  y = '..density..',
  fill = "steelblue",
  bins = 15,
  add_density = TRUE
)
```


Our distribution doesn't seems to be normal, nor binomial we will have to rely on non-parametric tests.

Let's run a non-parametric test base on the mean of distributions, 
since it's the clothest to our 'intuitive' approach. Let there be 
[Wilcoxon](https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test) test.

In R, it's quite straightforward: we have the function 
[`wilcoxon_test`](https://www.rdocumentation.org/packages/rstatix/versions/0.7.1) 
to perform the test, then we can plot the result.

```{r wilcoxon_Plac8, eval = TRUE, echo=TRUE}
# On the expression table stored in the varialbe `Plac8`,
# first apply the function `wilcox_test` from package `rstatix`,
# then we apply the function `add_significance` from package `rstatix`
stat.test <- Plac8 %>% rstatix::wilcox_test(Plac8 ~ phase) %>% rstatix::add_significance()
# While doing so, we usually also compute effect size
# eff.size <- Plac8 %>% rstatix::wilcox_effsize(Plac8 ~ phase)
```

```{r display_wilcoxon_Plac8_result, echo = FALSE, eval = TRUE}
DT::datatable(stat.test, caption = "Wilcoxon test result")
stat.test <- Plac8 %>% rstatix::t_test(Plac8 ~ phase) %>% rstatix::add_significance()
```

Wilcoxon test says: the distributions are different, with a `r stat.test$p * 100` 
% of risks of being wrong. The gene Plac8 can safely be said differentially 
expressed. We can compute a fold change and conclude.

Just out of curiosity, be aware that [`t_test`](https://www.rdocumentation.org/packages/rstatix/versions/0.7.2/topics/t_test)
from `rstatix` package, gives the same answer. However,
[`DESeq`](https://satijalab.org/seurat/reference/findmarkers) gives a p-value
of 0.05 and an adjusted p-value equal to 1.

Depending on the test used, this gene is, or is not differentially expressed.


## Conclusion

In conclusion, to select your method, use the following:

1. If you have already done a study with one of these methods, keep using the same. This is crutial if you ever want to compare your new result with the old ones.
1. If you want to compare your study with a published one, use the same method.
1. If you have no idea, use Wilcoxon.
1. If you have bulk data analyzed with DESeq2/Limma, use DESeq2/Limma. It will be easier to take both works in consideration.

**Please, never use a simple Wilcoxon on bulk RNA-Seq data.**

# Select a dataset

## Dataset depends on selected method

There it is quite easier:

```{r choose_counts, eval=TRUE, results='asis', echo=FALSE}
choose_counts <- as.data.frame(t(data.frame(
  wilcox = c("Normalized counts", "sojb@assays[['RNA']]@data"),
  t_test = c("Normalized counts", "sojb@assays[['RNA']]@data"),
  ROC = c("Normalized counts", "sojb@assays[['RNA']]@data"),
  ANOVA = c("Normalized counts", "sojb@assays[['RNA']]@data"),
  MAST = c("Raw counts", "sojb@assays[['RNA']]@counts"),
  DESeq2 = c("Raw counts", "sojb@assays[['RNA']]@counts"),
  Limma = c("Raw counts", "sojb@assays[['RNA']]@counts")
)))
colnames(choose_counts) <- c("Counts", "slot name")
DT::datatable(choose_counts, caption = "How to select your counts")
```

## FindMarkers

With the function [`FindMarkers`](https://satijalab.org/seurat/reference/findmarkers) 
from package `Seurat`, we want to make three groups: 

1. One using `wilcoxon` to perform DEA between "G1" and "S" phases.
1. One using `t`-test to perform DEA between "G1" and "S" phases.
1. One using `ROC` to perform DEA between "G1" and "S" phases.

We will observe the results and compare our gene lists.

Hey, why are you looking at me? It's your turn to work! Use the all the
notions seen above to select the right counts (`slot`), the right input
object, and the right arguments.

10 minutes practice !

<details>

<summary>Answers</summary>

Here are the code for each team:

```{r findmarkers_all_de, echo=TRUE, eval=TRUE}
sobj_wilcox <- Seurat::FindMarkers(
  # The variable that contains Seurat Object
  object = sobj,
  # Name of condition 1
  ident.1 = "8",
  # Name of condition 2
  ident.2 = "10",
  # Factor name in the Seurat Object
  group.by = "HarmonyStandalone_clusters",
  # Differential analysis method
  test.use = "wilcox"
)

sobj_t <- Seurat::FindMarkers(
  # The variable that contains Seurat Object
  object = sobj,
  # Name of condition 1
  ident.1 = "8",
  # Name of condition 2
  ident.2 = "10",
  # Factor name in the Seurat Object
  group.by = "HarmonyStandalone_clusters",
  # Differential analysis method
  test.use = "t"
)

sobj_roc <- Seurat::FindMarkers(
  # The variable that contains Seurat Object
  object = sobj,
  # Name of condition 1
  ident.1 = "8",
  # Name of condition 2
  ident.2 = "10",
  # Factor name in the Seurat Object
  group.by = "HarmonyStandalone_clusters",
  # Differential analysis method
  test.use = "roc"
)
```

```{r load_dea, eval=TRUE, echo=FALSE}
base::saveRDS(file="sobj_wilcox.RDS", object=sobj_wilcox)
```

</details>
<br />

In the function argument, there is a FoldChange threshold. Should we
filter gene expression based on FoldChange? In case of positive answer,
how much should that threshold be?

<details>

<summary>Answer</summary>


About thresholds on FDR (False Discovery Rate) and Log2(FC) (Log of the Fold Change), there are many discussions.

Remember, here the threshold on Fold Change is Logged. A `log2(1) = ``r log2(1)`. And keep in mind the following:

1. If one selects a fold change threshold above 1.7, then their study concludes that smoking is not related to lung cancer.
1. If one selects a fold change threshold above 1, then their study concludes that a fast-food based diet does not lead to weight gain.
1. If one selects a fold change threshold above 1/25 000 000, then their study concludes: it is a complete hazard that mice have featal malformation when in presence of Bisphanol.

In conclusion, there one, and only one reason to filter on fold change: in my experience, a fold change below 0.7 will be hard to see/verify on wet-lab (qRT).

If you need to reduce a too large number of differentially expressed genes, then reduce the FDR to 0.01, or even better, to 0.001. With that, you reduce your number of false claims.

</details>
<br />

Can you help me with `DEseq2`?

When I run the following command line, I have an error :

```{r seurat_run_deseq_with_error, eval=FALSE, echo=TRUE}
sobj_deseq2 <- Seurat::FindMarkers(
  # The variable that contains Seurat Object
  object = sobj,
  # Name of condition 1
  ident.1 = 8,
  # Name of condition 2
  ident.2 = 10,
  # Factor name in the Seurat Object
  group.by = "HarmonyStandalone_clusters",
  # Differential analysis method
  test.use = "deseq2",
  # Use non-normalized data with DESeq2
  slot = "counts"
)
```

> Error in PerformDE(object = object, cells.1 = cells.1, cells.2 = cells.2,  : 
>   Unknown test: deseq2


<details>

<summary>Answer</summary>

Oh! My fault, it was a typo in my command! Thank you all for your help!

```{r build_count_p1_matrix, eval=TRUE, echo=TRUE}
sobj_deseq2 <- Seurat::FindMarkers(
  # The variable that contains Seurat Object
  object = sobj,
  # Name of condition 1
  ident.1 = "8",
  # Name of condition 2
  ident.2 = "10",
  # Factor name in the Seurat Object
  group.by = "HarmonyStandalone_clusters",
  # Differential analysis method
  test.use = "DESeq2",
  # Use non-normalized data with DESeq2
  slot = "counts"
)
```

Remark: by doing surch modification, some fold changes have been modified:
remember the gene Atad2 with a mean expression of 0.08 in G1, and 0.2 in S 
phases? Mean expressions are now around 1.08 for G1, and 1.2 for S phases.
This may be the reason why it was not differentially expressed in DESeq2,
while Wilcoxon and T-test returned adjusted pvalues far below 0.05.

</details>



```{r save_de_results, eval=TRUE, echo=FALSE}
base::saveRDS(sobj_wilcox, "sobj_wilcox.RDS")
base::saveRDS(sobj_t, "sobj_t.RDS")
base::saveRDS(sobj_roc, "sobj_roc.RDS")
base::saveRDS(sobj_deseq2, "sobj_deseq2.RDS")
```

# Explore results

## Big tables

Let us have a look at the results:

```{r sobj_w_res_display, eval=TRUE, echo=FALSE}
DT::datatable(
  head(sobj_wilcox, n = 10),
  caption = "Wilcoxon test results"
)
```

We have the following columns:

1. `p_val`: Ignore this column. Always ignore raw p-values. Look at corrected ones, and if they are missing, then compute them.
1. `avg_log2FC`: Average Log2(FoldChange). Illustrates how much a gene is differentially expessed between samples in each condition.
1. `pct.1`: Percent of cells with gene expression in condition one, here in "G1" phase.
1. `pct.2`: Percent of cells with gene expression in condition two, here in "S" phase.
1. `p_val_adj`: The confidence we have in the result. The closer to 0, the lesser is the risk of error.


```{r sobj_t_res_display, eval=TRUE, echo=FALSE}
DT::datatable(
  head(sobj_t, n = 10),
  caption = "T-test results"
)
```

We have the following columns:

1. `p_val`: Ignore this column. Always ignore raw p-values. Look at corrected ones, and if they are missing, then compute them.
1. `avg_log2FC`: Average Log2(FoldChange). Illustrates how much a gene is differentially expessed between samples in each condition.
1. `pct.1`: Percent of cells with gene expression in condition one, here in "G1" phase.
1. `pct.2`: Percent of cells with gene expression in condition two, here in "S" phase.
1. `p_val_adj`: The confidence we have in the result. The closer to 0, the lesser is the risk of error.

```{r sobj_roc_res_display, eval=TRUE, echo=FALSE}
DT::datatable(
  head(sobj_roc, n = 10),
  caption = "ROC test results"
)
```

We have the following columns:

1. `myAUC`: The area under the curve
1. `avg_diff`: Average difference in AUC
1. `power`: `abs(AUC-0.5) * 2`, usefull to sort genes based on AUC
1. `pct.1`: Percent of cells with gene expression in condition one, here in "G1" phase.
1. `pct.2`: Percent of cells with gene expression in condition two, here in "S" phase.


```{r sobj_deseq_res_display, eval=TRUE, echo=FALSE}
DT::datatable(
  head(sobj_t, n = 10),
  caption = "T-test results"
)
```

We have the following columns:

1. `p_val`: Ignore this column. Always ignore raw p-values. Look at corrected ones, and if they are missing, then compute them.
1. `avg_log2FC`: Average Log2(FoldChange). Illustrates how much a gene is differentially expessed between samples in each condition.
1. `pct.1`: Percent of cells with gene expression in condition one, here in "G1" phase.
1. `pct.2`: Percent of cells with gene expression in condition two, here in "S" phase.
1. `p_val_adj`: The confidence we have in the result. The closer to 0, the lesser is the risk of error.


## Filter DEA results

What kind of threshold should be used to filter each results?

```{r extract_de_genes, eval=TRUE, echo=FALSE}
# We store a `list` in a variable called `data`
# The function `list` comes from `base` and not `biocGenerics`.
data <- base::list(
  # We use a threshold of 5% on adjusted p-values
  wilcox = base::rownames(sobj_wilcox[sobj_wilcox$p_val_adj <= 0.05, ]),
  # We use a threshold of 5% on adjusted p-values
  t_test = base::rownames(sobj_t[sobj_t$p_val_adj <= 0.05, ]),
  # We use a threshold of 0.2 in AUC power
  roc    = base::rownames(sobj_roc[sobj_roc$power >= 0.2, ]),
  # We use a threshold of 5% on adjusted p-values
  deseq2 = base::rownames(sobj_deseq2[sobj_deseq2$p_val_adj <= 0.05, ])
)
```

> If we must label certain scores as good or bad, we can reference the 
following rule of thumb:
>
> 0.5 = No discrimination
> 0.5-0.7 = Poor discrimination
> 0.7-0.8 = Acceptable discrimination
> 0.8-0.9= Excellent discrimination
> 0.9 = Outstanding discrimination

Hosmer and Lemeshow in Applied Logistic Regression (p. 177)

## Add results to Seurat objects

We'd like to store the results of differential expression analysis in 
the `Seurat` object.

```{r add_results_seurat, echo=TRUE, eval=TRUE}
sobj@misc$wilcox <- sobj_wilcox
```

```{r save_sobjw, eval=TRUE, echo=FALSE}
base::saveRDS(sobj, "DEA_Scaled_Normalized_Filtered.RDS")
```

## Common results

Now we can plot intersections in an up-set graph. It is like a venn diagramm:

```{r upset_seurat_de_methods, eval=TRUE, echo=TRUE}
UpSetR::upset(
  data = UpSetR::fromList(data),
  order.by = "freq"
)
```



## Heatmap

We'd like to display the expression of genes identified by FindMarkers. 
Then we use the function [`DoHeatmap`](https://satijalab.org/seurat/reference/doheatmap) 
from the package `Seurat`.

In order to limit the graph to differentially expressed reads, we use the
function [`rownames`](https://rdocumentation.org/packages/base/versions/3.6.2/topics/row.names)
from R `base` package on the DEA result table. In this example, I use
the results of wilcoxon, but you shall use any of the results you previously
obtained.

```{r seurat_heatmap, eval=TRUE, echo=TRUE}
Seurat::DoHeatmap(
  # variable pointing to seurat object
  object = sobj,
  # name of DE genes
  features = base::rownames(sobj_wilcox),
  # Cluster annotation
  group.by = "HarmonyStandalone_clusters",
)
```

## Volcano plot

A Volcano plot is usefull to identify differnetial expression
analysis bias.

The package `EnhancedVolcano` has an [eponym](https://bioconductor.org/packages/3.17/bioc/html/EnhancedVolcano.html)
function for that:

```{r enhanced_volcanoplot, eval=TRUE, echo=TRUE}
EnhancedVolcano::EnhancedVolcano(
  #  variable pointing to the DEA results
  toptable = sobj_wilcox,
  # Gene names
  lab = rownames(sobj_wilcox),
  # Column in which to find Fold Change
  x = "avg_log2FC",
  # Column in which to find confidence interval
  y = "p_val_adj",
  # Lower fold-change cut-off
  FCcutoff = 0.2
)
```

## Session Info

This list of all packages used while you work should be included
in each en every R presentation:

```{r session_info, eval=TRUE, echo=TRUE}
utils::sessionInfo()
```
+ + +
+
+ +
+ + + + + + + + + + + + + + + + + diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/SingleCell_GSEA.Rmd b/2024/evaiin1/SingleCell/DEA_GSEA/SingleCell_GSEA.Rmd new file mode 100644 index 0000000..46c203f --- /dev/null +++ b/2024/evaiin1/SingleCell/DEA_GSEA/SingleCell_GSEA.Rmd @@ -0,0 +1,872 @@ +--- +title: "
EBAII n1 2024
Gene Set Enrichment Analysis
" +date: "`r Sys.Date()`" +author: + - name: "Thibault DAYRIS" + email: "thibault.dayris@gustaveroussy.fr" + - name: "Bastien JOB" + email: "bastien.job@gustaveroussy.fr" +output: + html_document: # Defautl view + highlight: tango ## Theme for the code chunks + number_sections: true ## Adds number to headers (sections) + theme: flatly ## CSS theme for the HTML page + toc: true ## Adds a table of content + toc_float: ## TOC options + collapsed: true ## By default, the TOC is folded + smooth_scroll: true ## Smooth scroll of the HTML page + self_contained: true ## Includes all plots/images within the HTML + code_download: true ## Adds a button to download the Rmd + code_folding: show + thumbnails: false + lightbox: true + fig_caption: true + gallery: true + use_bookdown: true +always_allow_html: true ## Allow plain HTML code in the Rmd +editor_options: + markdown: + wrap: 88 +--- + + + + +```{css banner, echo = FALSE} +body { + background-image: url('ebaii_banner.png'); + background-repeat: no-repeat; + background-size: 100%; + margin: 10% +} + +div { + background-color: rgba(255, 255, 255, 0.35) /* 35% opaque white */; + padding: 0.25em; +} +``` + + + +```{r setup, include=FALSE, echo=FALSE} +knitr::opts_chunk$set( + echo = TRUE, # Print the code + eval = TRUE, # Do not run command lines + message = FALSE, # Print messages + prompt = FALSE, # Do not display prompt + comment = NA, # No comments on this section + warning = FALSE, # Display warnings + tidy = FALSE, + fig.align = "center", + width = 100 # Number of characters per line +) +hooks = knitr::knit_hooks$get() +hook_foldable = function(type) { + force(type) + function(x, options) { + res = hooks[[type]](x, options) + + if (isFALSE(options[[paste0("fold.", type)]])) return(res) + + paste0( + "
Show ", type, "\n\n", + res, + "\n\n
" + ) + } +} +Hmisc::hidingTOC( + buttonLabel = 'Show TOC', + hidden = TRUE, + tocSide = 'left', + buttonSide='left', + posCollapse = 'margin', + levels = 3 +) +my_seed <- 1337L +``` + + + +```{css chunks, echo = FALSE} +div.beyond pre { background-color: pink; color : black; } +div.beyond pre.r { background-color: lightblue; border: 3px solid blue; } +div.notrun pre { background-color: lightyellow; color : brown; } +div.notrun pre.r { background-color: lightgrey; border: 3px solid black; } +``` + + + +# Forewords + +## TLDR: R command lines + +In this presentation, there will be screen captures for you to follow the +lesson. There will also be every single R command lines. +Do not take care of the command lines if you find them too challenging. +Our goal here, is to understand the main mechanism of Differential +Expression Analysis. R is just a tool. + +Below are the libraries we need to perform this whole session: + +```{r load_libraries, eval=TRUE, echo=TRUE} +base::library(package = "BiocParallel") # Optionally multithread some steps +base::library(package = "DT") # Display nice table in HTML +base::library(package = "ggplot2") # Draw graphs and plots +base::library(package = "ggpubr") # Draw nicer graphs +base::library(package = "rstatix") # Base R statistics +base::library(package = "knitr") # Build this presentation +base::library(package = "dplyr") # Handle big tables +base::library(package = "Seurat") # Handle SingleCell analyses +base::library(package = "SeuratObject") # Handle SingleCell objects +base::library(package = "SingleCellExperiment") # Handle SingleCell file formats +base::library(package = "escape") # Perform exploratory enrichment analysis +base::library(package = "clusterProfiler") # Perform GSEA analysis +base::library(package = "dittoSeq") # Draw nice plots based on Seurat +base::library(package = "org.Mm.eg.db") # Mouse genome annotation +base::library(package = "Cairo") # Graphs library +base::library(package = "pathview") # For the whole pathway graph +``` + +First, we load Seurat object: + +```{r load_rds_tldr, eval=TRUE, echo=TRUE} +sobj <- base::readRDS( + # Path to the RDS file + file = "DEA_Scaled_Normalized_Filtered.RDS" +) +``` + +Then we launch enrichment exploration on all counts: + +```{r escape_enrichment_tldr, eval=FALSE, echo=TRUE} +# Acquire gene sets +mh_hallmark <- escape::getGeneSets( + species = "Mus musculus", + library = "H" +) + +# Run enrichment +enrichment_matrix <- escape::escape.matrix( + input.data = sobj, + gene.sets = mh_hallmark, + method = "ssGSEA", + groups = 1000, + min.size = 5, + BPPARAM = BiocParallel::SnowParam(workers = 2), +) + +# Save enrichment in seurat object +sobj[["escape"]] <- SeuratObject::CreateAssay5Object( + data=t(enrichment_matrix), +) + +saveRDS(sobj, file = "sobj_GSEA.RDS") +``` + + +## Purpose of this session + +Up to now, we have: + +1. Identified to which cell each sequenced reads come from +1. Identified to which gene each read come from +1. Identified possible bias in gene expression for each cell +1. Filtered and corrected these bias as well as we can +1. Found differentially expressed genes across multiple conditions +1. Annotated cell clusters + +We would like to identify the functions of genes among several clusters, +or differentially expressed genes. + +At the end of this session you will know: + +1. What is gene set analysis +1. How to choose a Gene Set database +1. How to perform an enrichment analysis +1. How to read Gene set analysis results + + +# Select a database + +## Gene: Jund + +Let's search information about this gene on the web. For mice, one of the +best web-site for human is: [MGI](https://www.informatics.jax.org/). + +If we search for the gene [Plac8](https://www.informatics.jax.org/marker/MGI:2445289), +we find the following: + +![plac8_mgi](images/plca8_mgi.png) + +As we click on "Phenotype reference", we can see that the MGI database can give +us [PubMeb IDs](https://pubmed.ncbi.nlm.nih.gov/) related to the gene Plca8. +Below, on the same page, we can see regulation pathways, protein ontology, +gene interactions, etc. + +![plca8_pathway](images/plac8_pathways.png) + +We illustrate here, that GSEA does not only include gene-to-function, or +gene-to-pathway. They usually include many more information. + +## Database types + +![db_types](images/gene_sets.png) + +There is a database for pretty much everything you might want. From +pharmacology, to regulatory objects, etc. + +Our dataset contains gene expression. But the same methods can be applied to +other kind of datasets. _E.g._ + +![ppis](images/ppis.png) + +Protein-protein interactions, drug interactions, SNP-interactions, etc. + +Within R, you may find genome databases for a wide range of organisms: + +![org_bd](images/org_db.png) + +## Some noticeable databases + +```{r databases_example, eval=TRUE, echo=FALSE} +databases <- base::data.frame( + MSigDB = c("https://www.gsea-msigdb.org/gsea/index.jsp"), + KEGG = c("https://www.genome.jp/kegg/"), + GO = c("https://www.geneontology.org/"), + Ensembl = c("https://ftp.ensembl.org/pub/"), + Panther = c("https://pantherdb.org/"), + Jaspar = c("https://jaspar.genereg.net/"), + GWAScat = c("https://www.ebi.ac.uk/gwas/"), + BS_Genome = c("https://bioconductor.org/packages/release/BiocViews.html#___BSgenome"), + MeSH = c("https://www.nlm.nih.gov/mesh/meshhome.html"), + OrgDb = c("https://bioconductor.org/packages/release/BiocViews.html#___OrgDb"), + NCG = c("http://www.network-cancer-genes.org/"), + PharmGKB = c("https://www.pharmgkb.org/") +) + +base::rownames(databases) <- c("Address") + +DT::datatable(base::t(databases)) +``` + +For this session, we are going to use MSigBD. + +## How to perform GSEA + +There are two main types of Gene Set Enrichment Analysis: +1. A blind search among all genes in order to identify pathways or mechanisms among cells. +1. A specific search through genes of interest + +The first is called "Exploratory" analysis and starts on the `Seurat` object. + +Many packages perform this kind of exploratory analysis, I chose to present +[`escape`](https://bioconductor.org/packages/release/bioc/html/escape.html) +for its simple and intuitive usage. + +What really matters is the GSEA method used in back-end. There are several +very well known methods: [`fgsea`](https://bioconductor.org/packages/release/bioc/html/fgsea.html), +[`gsva`](https://www.bioconductor.org/packages/release/bioc/html/GSVA.html), +[`DOSE`](https://www.bioconductor.org/packages/release/bioc/html/DOSE.html), +and the [`MSigDB`](https://www.gsea-msigdb.org/gsea/index.jsp)'s tool. While +choosing your favorite R package to perform GSEA, if that package uses one +of these methods, then you should not have odd questions through your +publication process. + +These methods are very alike, based on the same statistical resorts, and +suffer for the same limitations. Let's illustrate it with examples. + +# Exploratory analysis + +## Gene sets + +Up to now, we already define what's a _gene set_: a group of genes of interest. +We also know these king of relationships are stored in [`GMT`](https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29) +files or [`TSV`](https://fr.wikipedia.org/wiki/Tabulation-separated_values) +files. + +Since the beginning of our week, we've been working with R, and the question +naturally comes: how to make/use gene sets in R? + +### Create your own gene set + +Very easily. We need a "named [list](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/list) +of [vectors](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/c)", +all coming from `base` R package. + +```{r named_list_vectors, eval=TRUE, echo=TRUE} +# We use the function list from base package +pathways <- base::list( + # We use the function `c` from base package + proliferation = base::c( + "Ifng", "Kras", "Mgat5", + "Prf1", "Trem1", "Trp53" + ), + growth = base::c( + "Cdkn2a", "Fos", "Ifnb1", + "Rras", "Apc", "Sell" + ) +) +utils::head(pathways) +``` + +This method is very useful when you want to compare your cells toward +unpublished data, gene sets from an article not sourced in official databases, +etc. + +### Load a gene set from disk + +We can also load a GMT file from a file. This is very usefull when we +expect both known and new pathways to be tested, or known pathways that +should be updated. + +This is done with the function [`read.gmt`](https://rdrr.io/bioc/clusterProfiler/man/read-gmt.html) +from the [`clusterProfiler`](https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html) +package. + +```{r get_gene_sets_file, eval=TRUE, echo=TRUE} +pathways <- clusterProfiler::read.gmt( + gmtfile = "m5.mpt.v2023.1.Mm.symbols.gmt" +) +pathways <- BiocGenerics::lapply( + S4Vectors::split( + pathways[-1], pathways$term + ), + function(x) x[x != ""] +) +utils::head(pathways) +``` + +### Load a gene set from the web + +We can also get pathways from the web using the function [`getGeneSets`](https://rdrr.io/bioc/escape/man/getGeneSets.html) +from package [`escape`](https://rdrr.io/bioc/escape/). + +```{r get_gene_sets_msigdb, eval=TRUE, echo=TRUE} +# Pay attention to the genome name, it's case sensitive +# and follows MSigDB naming scheme +mh_hallmark <- escape::getGeneSets( + species = "Mus musculus", + library = "H" +) +utils::head(pathways) +``` + +## Run exploratory analysis + +Let's time to practice. Use the function [`escape.matrix`](https://rdrr.io/github/ncborcherding/escape/man/escape.matrix.html) +from the package [`escape`](https://rdrr.io/bioc/escape/) in order to perform +GSEA on your [`Seurat`]() +object. Perform the analysis on cell cycle phase if possible. +Save the results in a variable, for example `sobj_enrich`. + +
+ +Answer + +It is not possible to specify what group we are considering. GSEA/Enrichment +analysis do not perform any differential analysis. + +```{r escape_enrich, eval=TRUE, echo=TRUE} +# Save in a variable called `sobj_enrich` +# the result of the function `runEscape` from package `escape` +enrichment_matrix <- escape::escape.matrix( + # Out Seurat object + input.data = sobj, + # Provide gene sets + gene.sets = mh_hallmark, + # Select your favorite method + method = "ssGSEA", + # ssGSEA parameters + groups = 1000, + min.size = 5, + # Use 2 threads/CPU/workers + BPPARAM = BiocParallel::SnowParam(workers = 2), +) +``` + +If you have reserved more than one core, this function can be multi-threaded, +making it faster ! + +
+
+ + +## Visualize results + + +We can visualise the gene sets enriched in cell clusters using the function [`FeaturePlot`](https://satijalab.org/seurat/reference/featureplot) from +`Seurat` package. We just need to color the cells with `ssGSEA`'s results: + +```{r unseen_load, eval=TRUE, echo=FALSE} +sobj <- readRDS("sobj_GSEA.RDS") +``` + +```{r featureplot_seurat_ssgsea, eval=TRUE, echo=TRUE} +Seurat::FeaturePlot(object = sobj, features = "HALLMARK-DNA-REPAIR") +``` + +# On purpose analysis + +In order to understand what happened, where do these results come from, +we need to go step by step and perform both enrichment and GSE analysis +manually. + +In general, the exploratory method is never used on a raw Seurat object, +we usually filter this object in order to search pathways that are related to +our biological question. + +With the previous method, what ever our biological question was, the cell +cycle phase was one of the top gene expression drivers. + +## Gene Names and Gene identifiers + +Let's search information about the gene `ARP2`. + +![arp2_multiple](images/arp2_multiple_genome.png) + +So, this gene exists in multiple organisms. Thanks, we know we are working +on mice, and we told gene set retriever function to consider 'mouse' datasets. + +![arp2_names](images/arp2_name.png) + +So in the Human genome, ARP2 refers to multiple genes through the alias names. +When we searched for gene sets with [`escape`], we did not perform any +disambiguation. + +The same with _arabidopsis thaliana_, ARP2 can refer to actin related protein, +or a root development protein. + +In _mus musculus_, ARP2 refers to an actin related protein on chromosome 11, or +a subunit adaptor protein on chromosome 5. + +We now understand that `escape` analysis may be completely wrong since it used +human-intelligible gene names. These names include synonyms, homonyms, within +a single organism, or between multiple ones. + +We need to use unique identifiers. These identifiers are called gene identifiers +and we usually consider [`EnsemblID`](https://ftp.ensembl.org/pub/release-60/gtf/mus_musculus/) +or [`EntrezID`](https://www.ncbi.nlm.nih.gov/Web/Search/entrezfs.html). + +For example: + +1. ARP2 from chromosome 11 equals [`ENSMUSG00000020152`](http://www.ensembl.org/Mus_musculus/Gene/Summary?g=ENSMUSG00000020152;r=11:20012304-20062913) at Ensembl. +1. ARP2 from chromosome 5 equals [`ENSMUSG00000019518`](http://www.ensembl.org/Mus_musculus/Gene/Summary?g=ENSMUSG00000019518;r=5:138170264-138178691) at Ensembl. + +These identifiers are very unpleasant to use for human beings. On monday meeting, +please talk about `ARP2` and not `ENSMUSG00000020152`. However, when machines +are working, please given them `EnsemblID` or `EntrezID`. + +Let's translate all gene identifiers and evaluate possible number of errors +in escape analysis. + +First, let's load the DEA results based on Wilcoxon analysis: + +```{r get_de_gene_names, eval=TRUE, echo=TRUE} +# We load our differential expression analysis result +# using the function `readRDS` from `base` package +sobj_wilcox <- base::readRDS(file = "sobj_wilcox.RDS") +# Copy gene names in a column it makes the future operation easier +sobj_wilcox$SYMBOL <- base::rownames(sobj_wilcox) +``` + +Then, we use the function [`bitr`](https://rdrr.io/bioc/clusterProfiler/man/bitr.html) +(standing for _biological translator_) from the package [`clusterProfiler`](https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html). + +```{r cp_bitr_wilcox, eval=TRUE, echo=TRUE} +# We store the result of `bitr` function +# from `clusterProfiler` package +# in a variable called `annotation` +annotation <- clusterProfiler::bitr( + # We provide the variable pointing to gene names + geneID = base::rownames(sobj_wilcox), + # We tell the translator to which gene identifiers + # translation must be done + toType = base::c("ENTREZID", "ENSEMBL"), + # We tell the translator which type of gene name we have + fromType = "SYMBOL", + # We provide mmu database + OrgDb = org.Mm.eg.db +) +``` + +```{r disply_annotation, eval=TRUE, echo=FALSE} +DT::datatable(annotation) +``` + +Now we would like to have these annotation alongside with adjusted pvalues +and fold change information. In order to do so, we use the function [`merge`](https://rdrr.io/r/base/merge.html) +from [`base`](https://rdrr.io/r/) package. Not from [`Seurat`](https://rdrr.io/github/atakanekiz/Seurat3.0/man/merge.Seurat.html) +package ! This is used to merger Seurat objects, but we have dataframes here! + +```{r merge_annotation_wilcox, eval=TRUE, echo=TRUE} +# We filter these results on **ADJUSTED** pvalue +sobj_wilcox <- sobj_wilcox[sobj_wilcox$p_val_adj <= 0.05, ] +# Use the function `merge` from package `base` in order +# to add annotation to wixocon DEA result. +sobj_wilcox <- base::merge( + # Variable pointing to the wilcoxon results + x = sobj_wilcox, + # Variable pointing to the annotation results + y = annotation, + # We tell how to merge sobj_wilcox + by.x = "SYMBOL", + # We tell how to merge annotation + by.y = "SYMBOL" +) +``` + +```{r display_sobj_wilcox, eval=TRUE, echo=FALSE} +DT::datatable(sobj_wilcox) +``` + +## Restricted sed of genes + +As we perform the analysis, we are going to provide a numeric +variable to the GSEA/Enrichment. + +We have the following columns: + +1. `p_val`: Ignore this column. Always ignore raw p-values. Look at corrected ones, and if they are missing, then compute them. +1. `avg_log2FC`: Average Log2(FoldChange). Illustrates how much a gene is differentially expessed between samples in each condition. +1. `pct.1`: Percent of cells with gene expression in condition one, here in "G1" phase. +1. `pct.2`: Percent of cells with gene expression in condition two, here in "S" phase. +1. `p_val_adj`: The confidence we have in the result. The closer to 0, the lesser is the risk of error. + +Is it a good idea to use `p_val` ? What are the consequences ? + +
+ +Answer + +No. Never. Never ever use raw P-Value. It is never a good idea. + +
+
+ +Is it a good idea to use `avg_log2FC` ? What are the consequences ? + +
+ +Answer + +It is a good idea, we could answer biological questions like : "Considering +differentially expressed genes, what are the pathways with highest expression +change ?" + +
+
+ +Is it a good idea to use `pct.1` ? What are the consequences ? + +
+ +Answer + +It is a good idea, we could answer biological questions like : "Considering +differentially expressed genes, what are the expression of genes in pathway +XXX in the first condition ?" + +
+
+ +Is it a good idea to use `pct.2` ? What are the consequences ? + +
+ +Answer + +It is a good idea, we could answer biological questions like : "Considering +differentially expressed genes, what are the expression of genes in pathway +XXX in the second condition ?" + +
+
+ +Is it a good idea to use `p_val_adj` ? What are the consequences ? + +
+ +Answer + +It is a good idea, we could answer biological questions like : "Which pathways +are differentially expressed with highest confidence interval ?" + +But in order to perform surch test, use `-log10(Adjusted P-Value)` instead of +the raw adjusted p-value. Remember, 0 is a good confidence interval, and 1 a +bad one. So we need the values close to 0 to be highly positive. + +
+
+ +## Enrichment vs GSEA + +Previously, we used a function called `enrichIt`. We talked about enrichment +analysis, yet this talk is called GSEA. + +This is due to the fact that both techniques exists and are different. + +Enrichment tests a list of genes. Their expression, confidence +interval, etc. Nothing else matters than their name. + +For each cell, `enrichIt` removes zeros and tests the list of remaining genes. +Their expression does not matter. Their order in the gene list does not matter. +Their impact on the pathway does not matter. Their differential expression +status does not matter. + +Behind the scenes, it's a very simple tests answering the following question: +"Among expressed genes, what are the odds that it belongs to the pathway XXX?" + +We can do enrichment tests on our wilxoc results, using the function [`enrichGO`](https://rdrr.io/bioc/clusterProfiler/man/enrichGO.html) +from the package [`clusterProfiler`](https://rdrr.io/bioc/clusterProfiler/): + +```{r clusterprofiler_enrichment, eval=TRUE, echo=TRUE} +# We store the result of the function `enrichGO` from package `clusterProfiler` +# in the function `enrich_wilcox` +enrich_wilcox <- clusterProfiler::enrichGO( + # We provide the list of gene identifiers to test + gene = sobj_wilcox$ENTREZID, + # We provide the complete list of genes we quantified + universe = annotation$ENTREZID, + # We provide mouse annotations + OrgDb = org.Mm.eg.db, + # We tell the function that we use entrez-identifiers + keyType = "ENTREZID", + # We search results on Biological Process" + ont = "BP", + # We want all the results + pvalueCutoff = 1, + # We are humans, we wan human-readable gene names + readable = TRUE +) +``` + +```{r display_enrich_wilcox, eval=TRUE, echo=FALSE} +DT::datatable(enrich_wilcox@result) +``` + +We can have a look at pathways including the word `G2M`, using the functions +[`with`](https://rdrr.io/r/base/with.html) and [`grepl`](https://rdrr.io/r/base/grep.html) +from `base` package. + +```{r grep_g2m_enrich_cp, eval=TRUE, echo=TRUE} +# Get the result table +results_enrich_wilcox <- enrich_wilcox@result + +# We store the result of the line selection +# in a variable called `g2m_enrich` +g2m_enrich <- results_enrich_wilcox[ + # We select rows containing a given information + # using the function `with` from `base` package + base::with( + # We provide the variable pointing to enrichment results + data = results_enrich_wilcox, + # We provide the term we are searching and the column in which + # the term should be searched + base::grepl("G2M", Description) + ), # Do not forget his comma, since we are filtering a dataframe on rows +] +``` + +```{r display_enrich_G2M_wilcox, eval=TRUE, echo=FALSE} +DT::datatable(g2m_enrich) +``` + +GSEA tests the gene names, and uses the numeric value associated with +that gene in order to weight the results. It tests the name, the rank, +and the numeric value. + +First, we need to create a named list of gene weights. For this example, +we use the average fold change as weights. + +```{r gseago_wilcox_prepare_gene_list, eval=TRUE, echo=TRUE} +# We extract gene average fold change +gene_list <- sobj_wilcox$avg_log2FC +# We extract genes Entrez ID +base::names(gene_list) <- sobj_wilcox$ENTREZID +# We sort that list deacreasingly +gene_list <- base::sort(gene_list, decreasing=TRUE) + +# Check the results with the function `head` from `utils` package +utils::head(gene_list) +``` + +And now, we can run the [`gseGO`](https://rdrr.io/bioc/clusterProfiler/man/gseGO.html) +from [`clusterProfiler`](https://rdrr.io/bioc/clusterProfiler/). You'll see, +the interfacce is almost the same as for the enrichment: + +```{r gseago_wilcox_run, eval=TRUE, echo=TRUE} +# We use the function `gseGO` from the package `clusterProfiler` +# and store the result in a variable called `gsea_wilcox` +gsea_wilcox <- clusterProfiler::gseGO( + # We provide the variable pointing to named gene list + geneList = gene_list, + # We provide mouse annotations + OrgDb = org.Mm.eg.db, + # We tell the function that we use entrez-identifiers + keyType = "ENTREZID", + # We search results on Biological Process" + ont = "BP", + # We want all the results + pvalueCutoff = 1 +) +``` + +```{r display_gsea_wilcox, eval=TRUE, echo=FALSE} +DT::datatable(gsea_wilcox@result) +``` + +Just like before, we can have a look at pathways including the word +`G2M`, using the functions [`with`](https://rdrr.io/r/base/with.html) +and [`grepl`](https://rdrr.io/r/base/grep.html) from `base` package. + +```{r grep_g2m_gsego_cp, eval=TRUE, echo=TRUE} +# Get the result table +results_gse_wilcox <- enrich_wilcox@result + +# We store the result of the line selection +# in a variable called `g2m_enrich` +g2m_gse <- results_gse_wilcox[ + # We select rows containing a given information + # using the function `with` from `base` package + base::with( + # We provide the variable pointing to enrichment results + data = results_gse_wilcox, + # We provide the term we are searching and the column in which + # the term should be searched + base::grepl("G2/M", Description) + ), # Do not forget his comma, since we are filtering a dataframe on rows ! +] + +# Check the results with the function `head` from `utils` package +utils::head(g2m_gse) +``` + +```{r display_gse_G2M_wilcox, eval=TRUE, echo=FALSE} +DT::datatable(g2m_gse) +``` + +## Visualize results + +This makes a lot of tables. Let's make a lot of graphs ! + +We can use the function [`barplot`](https://rdrr.io/bioc/enrichplot/man/barplot.enrichResult.html) +from package [`graphics`](https://rdrr.io/bioc/graphics/) ; and not the ones +from any other package, or it won't work ! + +```{r barplot_ego, eval=TRUE, echo=TRUE, fig.height=10} +# We use the function `barplot` from package `enrichplot` +graphics::barplot( + # We provide the variable pointing to enrichment results + height = enrich_wilcox, + # We display the best 15 results + howCategory=15 +) +``` + +We can also use the function [`ditplot`](https://rdrr.io/bioc/enrichplot/man/barplot.enrichResult.html) +from package [`enrichplot`](https://rdrr.io/bioc/enrichplot/) ; and not the ones +from any other package, or it won't work ! Note the change in the input +parameter name: + +```{r dotplot_ego, eval=TRUE, echo=TRUE, fig.height=10} +# We use the function `barplot` from package `enrichplot` +enrichplot::dotplot( + # We provide the variable pointing to enrichment results + object = enrich_wilcox, + # We display the best 15 results + showCategory=15 +) +``` + +We can display traditional GSEA graph using [`gseaplot2`](https://rdrr.io/bioc/enrichplot/man/gseaplot2.html) +from package [`enrichplot`](https://rdrr.io/bioc/enrichplot/): + +```{r gseaplot, eval=TRUE, echo=TRUE} +# We use the function `barplot` from package `enrichplot` +enrichplot::gseaplot2( + # We provide the variable pointing to enrichment results + x = gsea_wilcox, + # We display the best result + geneSetID = 1 +) +``` + +With GSEA, you dot not test if a pathway is up or down regulated. +A pathway contains both enhancers and suppressors genes. An +up-regulation of enhancer genes and a down-regulation of suppressor genes +will lead to a “bad” enrichment score. However, this will lead to a strong +change in your pathway activity! + +If your favorite pathway does not have a “good enrichment score”, it does +not mean that pathway is not affected. + +The [`heatplot`](https://rdrr.io/bioc/enrichplot/man/heatplot.html) displays +both a heatmap of enriched pathways *and* their genes in commons: + +```{r heatplot_gsea, echo=TRUE, eval=TRUE, fig.height=10} +# We use the function `heatplot` from `enrichplot` package +enrichplot::heatplot( + # We probide the variable pointing to GSEA results + x = gsea_wilcox, + # We show the 15 best results + showCategory = 15, + # We color according to FoldChange + foldChange = gene_list +) +``` + +The genes in common between multiple gene sets are also visible through an +uspet plot: + +```{r upset_gsea, echo=TRUE, eval=TRUE, fig.height=10} +# We use the function `upsetplot` from `enrichplot` package +enrichplot::upsetplot( + # We probide the variable pointing to GSEA results + x = gsea_wilcox, + # We show the 10 best results + n = 10 +) +``` + +Finally, we can display whole pathways using KEGG database: + +```{r pathview_gene_list, eval=FALSE, echo=TRUE} +# We use the function pathview from pathview package +pathview::pathview( + gene.data = gene_list, # Our gene list + pathway.id = "mmu04110", # Our pathway + species = "mmu", # Our organism + # The color limits + limit = list(gene=max(abs(gene_list))), + gene.idtype = "ENTREZID" # The genes identifiers +) +``` + +![pathview_results](mmu04110.pathview.png) + +## Session Info + +This list of all packages used while you work should be included +in each en every R presentation: + +```{r session_info, eval=TRUE, echo=TRUE} +utils::sessionInfo() +``` \ No newline at end of file diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/SingleCell_GSEA.html b/2024/evaiin1/SingleCell/DEA_GSEA/SingleCell_GSEA.html new file mode 100644 index 0000000..b34cce2 --- /dev/null +++ b/2024/evaiin1/SingleCell/DEA_GSEA/SingleCell_GSEA.html @@ -0,0 +1,5257 @@ + + + + + + + + + + + + + + + ¶ EBAII n1 2024Gene Set Enrichment Analysis ¶ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+
+
+
+
+ +
+ + + + + + + + + + + + + +
+

1 Forewords

+
+

1.1 TLDR: R command +lines

+

In this presentation, there will be screen captures for you to follow +the lesson. There will also be every single R command lines. Do not take +care of the command lines if you find them too challenging. Our goal +here, is to understand the main mechanism of Differential Expression +Analysis. R is just a tool.

+

Below are the libraries we need to perform this whole session:

+
base::library(package = "BiocParallel")    # Optionally multithread some steps
+base::library(package = "DT")              # Display nice table in HTML
+base::library(package = "ggplot2")         # Draw graphs and plots
+base::library(package = "ggpubr")          # Draw nicer graphs
+base::library(package = "rstatix")         # Base R statistics
+base::library(package = "knitr")           # Build this presentation
+base::library(package = "dplyr")           # Handle big tables
+base::library(package = "Seurat")          # Handle SingleCell analyses
+base::library(package = "SeuratObject")    # Handle SingleCell objects
+base::library(package = "SingleCellExperiment") # Handle SingleCell file formats
+base::library(package = "escape")          # Perform exploratory enrichment analysis
+base::library(package = "clusterProfiler") # Perform GSEA analysis
+base::library(package = "dittoSeq")        # Draw nice plots based on Seurat
+base::library(package = "org.Mm.eg.db")    # Mouse genome annotation
+base::library(package = "Cairo")           # Graphs library
+base::library(package = "pathview")        # For the whole pathway graph
+

First, we load Seurat object:

+
sobj <- base::readRDS(
+  # Path to the RDS file
+  file = "DEA_Scaled_Normalized_Filtered.RDS"
+)
+

Then we launch enrichment exploration on all counts:

+
# Acquire gene sets
+mh_hallmark <- escape::getGeneSets(
+  species = "Mus musculus",
+  library = "H"
+)
+
+# Run enrichment
+enrichment_matrix <- escape::escape.matrix(
+  input.data = sobj,
+  gene.sets = mh_hallmark,
+  method = "ssGSEA",
+  groups =  1000,
+  min.size = 5,
+  BPPARAM = BiocParallel::SnowParam(workers = 2),
+)
+
+# Save enrichment in seurat object
+sobj[["escape"]] <- SeuratObject::CreateAssay5Object(
+  data=t(enrichment_matrix),
+)
+
+saveRDS(sobj, file = "sobj_GSEA.RDS")
+
+
+

1.2 Purpose of this +session

+

Up to now, we have:

+
    +
  1. Identified to which cell each sequenced reads come from
  2. +
  3. Identified to which gene each read come from
  4. +
  5. Identified possible bias in gene expression for each cell
  6. +
  7. Filtered and corrected these bias as well as we can
  8. +
  9. Found differentially expressed genes across multiple conditions
  10. +
  11. Annotated cell clusters
  12. +
+

We would like to identify the functions of genes among several +clusters, or differentially expressed genes.

+

At the end of this session you will know:

+
    +
  1. What is gene set analysis
  2. +
  3. How to choose a Gene Set database
  4. +
  5. How to perform an enrichment analysis
  6. +
  7. How to read Gene set analysis results
  8. +
+
+
+
+

2 Select a database

+
+

2.1 Gene: Jund

+

Let’s search information about this gene on the web. For mice, one of +the best web-site for human is: MGI.

+

If we search for the gene Plac8, we +find the following:

+
+ +

plac8_mgi

+
+

As we click on “Phenotype reference”, we can see that the MGI +database can give us PubMeb +IDs related to the gene Plca8. Below, on the same page, we can see +regulation pathways, protein ontology, gene interactions, etc.

+
+ +

plca8_pathway

+
+

We illustrate here, that GSEA does not only include gene-to-function, +or gene-to-pathway. They usually include many more information.

+
+
+

2.2 Database types

+
+ +

db_types

+
+

There is a database for pretty much everything you might want. From +pharmacology, to regulatory objects, etc.

+

Our dataset contains gene expression. But the same methods can be +applied to other kind of datasets. E.g.

+
+ +

ppis

+
+

Protein-protein interactions, drug interactions, SNP-interactions, +etc.

+

Within R, you may find genome databases for a wide range of +organisms:

+
+ +

org_bd

+
+
+
+

2.3 Some noticeable +databases

+
+ +

For this session, we are going to use MSigBD.

+
+
+

2.4 How to perform +GSEA

+

There are two main types of Gene Set Enrichment Analysis: 1. A blind +search among all genes in order to identify pathways or mechanisms among +cells. 1. A specific search through genes of interest

+

The first is called “Exploratory” analysis and starts on the +Seurat object.

+

Many packages perform this kind of exploratory analysis, I chose to +present escape +for its simple and intuitive usage.

+

What really matters is the GSEA method used in back-end. There are +several very well known methods: fgsea, +gsva, +DOSE, +and the MSigDB’s +tool. While choosing your favorite R package to perform GSEA, if that +package uses one of these methods, then you should not have odd +questions through your publication process.

+

These methods are very alike, based on the same statistical resorts, +and suffer for the same limitations. Let’s illustrate it with +examples.

+
+
+
+

3 Exploratory +analysis

+
+

3.1 Gene sets

+

Up to now, we already define what’s a gene set: a group of +genes of interest. We also know these king of relationships are stored +in GMT +files or TSV +files.

+

Since the beginning of our week, we’ve been working with R, and the +question naturally comes: how to make/use gene sets in R?

+
+

3.1.1 Create your own +gene set

+

Very easily. We need a “named list +of vectors”, +all coming from base R package.

+
# We use the function list from base package
+pathways <- base::list(
+  # We use the function `c` from base package
+  proliferation = base::c(
+    "Ifng", "Kras", "Mgat5",    
+    "Prf1", "Trem1",    "Trp53"
+  ),
+  growth = base::c(
+    "Cdkn2a", "Fos", "Ifnb1",
+    "Rras", "Apc", "Sell"
+  )
+)
+utils::head(pathways)
+
$proliferation
+[1] "Ifng"  "Kras"  "Mgat5" "Prf1"  "Trem1" "Trp53"
+
+$growth
+[1] "Cdkn2a" "Fos"    "Ifnb1"  "Rras"   "Apc"    "Sell"  
+

This method is very useful when you want to compare your cells toward +unpublished data, gene sets from an article not sourced in official +databases, etc.

+
+
+

3.1.2 Load a gene set +from disk

+

We can also load a GMT file from a file. This is very usefull when we +expect both known and new pathways to be tested, or known pathways that +should be updated.

+

This is done with the function read.gmt +from the clusterProfiler +package.

+
pathways <- clusterProfiler::read.gmt(
+  gmtfile = "m5.mpt.v2023.1.Mm.symbols.gmt"
+)
+pathways <- BiocGenerics::lapply(
+  S4Vectors::split(
+    pathways[-1], pathways$term
+  ),
+  function(x) x[x != ""]
+)
+utils::head(pathways)
+
$MP_ABNORMAL_TUMOR_VASCULARIZATION
+ [1] "Adamts12" "Aggf1"    "Amotl1"   "Anxa1"    "Arhgef4"  "Cav2"    
+ [7] "Ccm2l"    "Cd82"     "Dock4"    "Fkbpl"    "Gpr4"     "H2ax"    
+[13] "Idh2"     "Ifngr1"   "Il12rb2"  "Il1a"     "Il1b"     "Itga6"   
+[19] "Itgb3"    "Jcad"     "Mmrn2"    "Myct1"    "Ntn4"     "Peak1"   
+[25] "Pld1"     "Rhoj"     "Rras"     "S1pr2"    "Stard13" 
+
+$MP_DECREASED_CIRCULATING_TUMOR_NECROSIS_FACTOR_LEVEL
+ [1] "Adam17"   "Apba3"    "Arid5a"   "Bbs12"    "C3"       "Cd14"    
+ [7] "Cd19"     "Cd44"     "Clic4"    "Enpp2"    "F3"       "Ffar2"   
+[13] "Foxn1"    "Foxo1"    "Gba"      "Hectd3"   "Ifi204"   "Ifi35"   
+[19] "Ifit2"    "Ifngr1"   "Ikbkb"    "Il22"     "Il6ra"    "Irak2"   
+[25] "Irak4"    "Irf5"     "Lacc1"    "Lep"      "Lrrc19"   "Lrrc8e"  
+[31] "Mapkapk2" "Mbl1"     "Mcpt1"    "Mif"      "Msmp"     "Mstn"    
+[37] "Myd88"    "Nfkbib"   "Nkg7"     "Nmi"      "Npy1r"    "Parp1"   
+[43] "Peli1"    "Pilrb1"   "Plekhf1"  "Prkce"    "Rhbdf2"   "Sgms1"   
+[49] "Siglec1"  "Snx8"     "Sra1"     "Thbd"     "Ticam1"   "Tlr2"    
+[55] "Tnf"      "Tradd"    "Zdhhc1"  
+
+$MP_DECREASED_INCIDENCE_OF_INDUCED_TUMORS
+ [1] "Ccng1"   "Cd151"   "Cdk1"    "Cebpb"   "Ct55"    "Cyp2c23" "Eif2s2" 
+ [8] "H2-Ob"   "Il6"     "Klk6"    "Krt10"   "Lrig2"   "Pgr"     "Terc"   
+[15] "Uqcc3"   "Zmiz1"   "a"      
+
+$MP_DECREASED_INCIDENCE_OF_TUMORS_BY_CHEMICAL_INDUCTION
+ [1] "Agtr2"     "Ahrr"      "Ar"        "Arl6ip5"   "Birc5"     "Brca2"    
+ [7] "Cacfd1"    "Camk2g"    "Ccl2"      "Cd151"     "Cd34"      "Cd96"     
+[13] "Cdkn1b"    "Cdkn2a"    "Cebpb"     "Chd1l"     "Cyp1a2"    "Cyp1b1"   
+[19] "Cyp2e1"    "Dek"       "Dnajc27"   "Endov"     "Ephx1"     "Fgf22"    
+[25] "Fntb"      "Foxm1"     "Foxn1"     "Fxyd5"     "Fzr1"      "Gata6os"  
+[31] "Hcst"      "Hipk1"     "Hras"      "Ier3"      "Il12b"     "Il1r1"    
+[37] "Il23a"     "Il6"       "Il6ra"     "Itgb1"     "Kdm4c"     "Klk6"     
+[43] "Lonp1"     "Loxl2"     "Lypd3"     "Mapkapk2"  "Mgat3"     "Mif"      
+[49] "Mir21a"    "Mir301"    "Mki67"     "Mmp11"     "Mmp19"     "Mmp1a"    
+[55] "Mpg"       "Mtdh"      "Muc4"      "Myd88"     "Neu3"      "Pard3"    
+[61] "Pik3ca"    "Pla2g4a"   "Plce1"     "Prdx1"     "Pten"      "Ptger1"   
+[67] "Ptger2"    "Ptger4"    "Ptgs2"     "Ptk2"      "Ptp4a3"    "Ptprj"    
+[73] "Pvr"       "Pycard"    "Ralgds"    "Retnlb"    "Skil"      "Slc3a2"   
+[79] "Spp1"      "Stag1"     "Stat3"     "Strap"     "Terc"      "Tert"     
+[85] "Tiam1"     "Tlr4"      "Tnf"       "Tnfaip8l3" "Tnik"      "Trem1"    
+[91] "Trim27"    "Trp53"     "Uri1"     
+
+$MP_DECREASED_LIVER_TUMOR_INCIDENCE
+[1] "Foxm1" "Mtdh"  "Myd88" "Tert"  "Tlr2" 
+
+$MP_DECREASED_LUNG_TUMOR_INCIDENCE
+[1] "Ggta1" "Kras"  "Lexm"  "Met"   "Mtdh"  "Tlr2" 
+
+
+

3.1.3 Load a gene set +from the web

+

We can also get pathways from the web using the function getGeneSets +from package escape.

+
# Pay attention to the genome name, it's case sensitive
+# and follows MSigDB naming scheme
+mh_hallmark <- escape::getGeneSets(
+  species = "Mus musculus", 
+  library = "H"
+)
+utils::head(pathways)
+
$MP_ABNORMAL_TUMOR_VASCULARIZATION
+ [1] "Adamts12" "Aggf1"    "Amotl1"   "Anxa1"    "Arhgef4"  "Cav2"    
+ [7] "Ccm2l"    "Cd82"     "Dock4"    "Fkbpl"    "Gpr4"     "H2ax"    
+[13] "Idh2"     "Ifngr1"   "Il12rb2"  "Il1a"     "Il1b"     "Itga6"   
+[19] "Itgb3"    "Jcad"     "Mmrn2"    "Myct1"    "Ntn4"     "Peak1"   
+[25] "Pld1"     "Rhoj"     "Rras"     "S1pr2"    "Stard13" 
+
+$MP_DECREASED_CIRCULATING_TUMOR_NECROSIS_FACTOR_LEVEL
+ [1] "Adam17"   "Apba3"    "Arid5a"   "Bbs12"    "C3"       "Cd14"    
+ [7] "Cd19"     "Cd44"     "Clic4"    "Enpp2"    "F3"       "Ffar2"   
+[13] "Foxn1"    "Foxo1"    "Gba"      "Hectd3"   "Ifi204"   "Ifi35"   
+[19] "Ifit2"    "Ifngr1"   "Ikbkb"    "Il22"     "Il6ra"    "Irak2"   
+[25] "Irak4"    "Irf5"     "Lacc1"    "Lep"      "Lrrc19"   "Lrrc8e"  
+[31] "Mapkapk2" "Mbl1"     "Mcpt1"    "Mif"      "Msmp"     "Mstn"    
+[37] "Myd88"    "Nfkbib"   "Nkg7"     "Nmi"      "Npy1r"    "Parp1"   
+[43] "Peli1"    "Pilrb1"   "Plekhf1"  "Prkce"    "Rhbdf2"   "Sgms1"   
+[49] "Siglec1"  "Snx8"     "Sra1"     "Thbd"     "Ticam1"   "Tlr2"    
+[55] "Tnf"      "Tradd"    "Zdhhc1"  
+
+$MP_DECREASED_INCIDENCE_OF_INDUCED_TUMORS
+ [1] "Ccng1"   "Cd151"   "Cdk1"    "Cebpb"   "Ct55"    "Cyp2c23" "Eif2s2" 
+ [8] "H2-Ob"   "Il6"     "Klk6"    "Krt10"   "Lrig2"   "Pgr"     "Terc"   
+[15] "Uqcc3"   "Zmiz1"   "a"      
+
+$MP_DECREASED_INCIDENCE_OF_TUMORS_BY_CHEMICAL_INDUCTION
+ [1] "Agtr2"     "Ahrr"      "Ar"        "Arl6ip5"   "Birc5"     "Brca2"    
+ [7] "Cacfd1"    "Camk2g"    "Ccl2"      "Cd151"     "Cd34"      "Cd96"     
+[13] "Cdkn1b"    "Cdkn2a"    "Cebpb"     "Chd1l"     "Cyp1a2"    "Cyp1b1"   
+[19] "Cyp2e1"    "Dek"       "Dnajc27"   "Endov"     "Ephx1"     "Fgf22"    
+[25] "Fntb"      "Foxm1"     "Foxn1"     "Fxyd5"     "Fzr1"      "Gata6os"  
+[31] "Hcst"      "Hipk1"     "Hras"      "Ier3"      "Il12b"     "Il1r1"    
+[37] "Il23a"     "Il6"       "Il6ra"     "Itgb1"     "Kdm4c"     "Klk6"     
+[43] "Lonp1"     "Loxl2"     "Lypd3"     "Mapkapk2"  "Mgat3"     "Mif"      
+[49] "Mir21a"    "Mir301"    "Mki67"     "Mmp11"     "Mmp19"     "Mmp1a"    
+[55] "Mpg"       "Mtdh"      "Muc4"      "Myd88"     "Neu3"      "Pard3"    
+[61] "Pik3ca"    "Pla2g4a"   "Plce1"     "Prdx1"     "Pten"      "Ptger1"   
+[67] "Ptger2"    "Ptger4"    "Ptgs2"     "Ptk2"      "Ptp4a3"    "Ptprj"    
+[73] "Pvr"       "Pycard"    "Ralgds"    "Retnlb"    "Skil"      "Slc3a2"   
+[79] "Spp1"      "Stag1"     "Stat3"     "Strap"     "Terc"      "Tert"     
+[85] "Tiam1"     "Tlr4"      "Tnf"       "Tnfaip8l3" "Tnik"      "Trem1"    
+[91] "Trim27"    "Trp53"     "Uri1"     
+
+$MP_DECREASED_LIVER_TUMOR_INCIDENCE
+[1] "Foxm1" "Mtdh"  "Myd88" "Tert"  "Tlr2" 
+
+$MP_DECREASED_LUNG_TUMOR_INCIDENCE
+[1] "Ggta1" "Kras"  "Lexm"  "Met"   "Mtdh"  "Tlr2" 
+
+
+
+

3.2 Run exploratory +analysis

+

Let’s time to practice. Use the function escape.matrix +from the package escape in order to +perform GSEA on your Seurat object. Perform +the analysis on cell cycle phase if possible. Save the results in a +variable, for example sobj_enrich.

+
+ +Answer + +

It is not possible to specify what group we are considering. +GSEA/Enrichment analysis do not perform any differential analysis.

+
# Save in a variable called `sobj_enrich`
+# the result of the function `runEscape` from package `escape`
+enrichment_matrix <- escape::escape.matrix(
+  # Out Seurat object
+  input.data = sobj,
+  # Provide gene sets
+  gene.sets = mh_hallmark,
+  # Select your favorite method
+  method = "ssGSEA",
+  # ssGSEA parameters
+  groups =  1000,
+  min.size = 5,
+  # Use 2 threads/CPU/workers
+  BPPARAM = BiocParallel::SnowParam(workers = 2),
+)
+
[1] "Using sets of 1000 cells. Running 4 times."
+[1] "Calculating ranks..."
+[1] "Calculating absolute values from ranks..."
+[1] "Calculating ranks..."
+[1] "Calculating absolute values from ranks..."
+[1] "Calculating ranks..."
+[1] "Calculating absolute values from ranks..."
+[1] "Calculating ranks..."
+[1] "Calculating absolute values from ranks..."
+

If you have reserved more than one core, this function can be +multi-threaded, making it faster !

+
+


+
+
+

3.3 Visualize +results

+

We can visualise the gene sets enriched in cell clusters using the +function FeaturePlot +from Seurat package. We just need to color the cells with +ssGSEA’s results:

+
Seurat::FeaturePlot(object = sobj, features = "HALLMARK-DNA-REPAIR")
+

+
+
+
+

4 On purpose +analysis

+

In order to understand what happened, where do these results come +from, we need to go step by step and perform both enrichment and GSE +analysis manually.

+

In general, the exploratory method is never used on a raw Seurat +object, we usually filter this object in order to search pathways that +are related to our biological question.

+

With the previous method, what ever our biological question was, the +cell cycle phase was one of the top gene expression drivers.

+
+

4.1 Gene Names and Gene +identifiers

+

Let’s search information about the gene ARP2.

+
+ +

arp2_multiple

+
+

So, this gene exists in multiple organisms. Thanks, we know we are +working on mice, and we told gene set retriever function to consider +‘mouse’ datasets.

+
+ +

arp2_names

+
+

So in the Human genome, ARP2 refers to multiple genes through the +alias names. When we searched for gene sets with [escape], +we did not perform any disambiguation.

+

The same with arabidopsis thaliana, ARP2 can refer to actin +related protein, or a root development protein.

+

In mus musculus, ARP2 refers to an actin related protein on +chromosome 11, or a subunit adaptor protein on chromosome 5.

+

We now understand that escape analysis may be completely +wrong since it used human-intelligible gene names. These names include +synonyms, homonyms, within a single organism, or between multiple +ones.

+

We need to use unique identifiers. These identifiers are called gene +identifiers and we usually consider EnsemblID +or EntrezID.

+

For example:

+
    +
  1. ARP2 from chromosome 11 equals ENSMUSG00000020152 +at Ensembl.
  2. +
  3. ARP2 from chromosome 5 equals ENSMUSG00000019518 +at Ensembl.
  4. +
+

These identifiers are very unpleasant to use for human beings. On +monday meeting, please talk about ARP2 and not +ENSMUSG00000020152. However, when machines are working, +please given them EnsemblID or EntrezID.

+

Let’s translate all gene identifiers and evaluate possible number of +errors in escape analysis.

+

First, let’s load the DEA results based on Wilcoxon analysis:

+
# We load our differential expression analysis result
+# using the function `readRDS` from `base` package
+sobj_wilcox <- base::readRDS(file = "sobj_wilcox.RDS")
+# Copy gene names in a column it makes the future operation easier
+sobj_wilcox$SYMBOL <- base::rownames(sobj_wilcox)
+

Then, we use the function bitr +(standing for biological translator) from the package clusterProfiler.

+
# We store the result of `bitr` function
+# from `clusterProfiler` package
+# in a variable called `annotation`
+annotation <- clusterProfiler::bitr(
+  # We provide the variable pointing to gene names
+  geneID = base::rownames(sobj_wilcox),
+  # We tell the translator to which gene identifiers
+  # translation must be done
+  toType = base::c("ENTREZID", "ENSEMBL"),
+  # We tell the translator which type of gene name we have
+  fromType = "SYMBOL",
+  # We provide mmu database
+  OrgDb = org.Mm.eg.db
+)
+
+ +

Now we would like to have these annotation alongside with adjusted +pvalues and fold change information. In order to do so, we use the +function merge from base package. Not from Seurat +package ! This is used to merger Seurat objects, but we have dataframes +here!

+
# We filter these results on **ADJUSTED** pvalue
+sobj_wilcox <- sobj_wilcox[sobj_wilcox$p_val_adj <= 0.05, ]
+# Use the function `merge` from package `base` in order
+# to add annotation to wixocon DEA result.
+sobj_wilcox <- base::merge(
+  # Variable pointing to the wilcoxon results
+  x = sobj_wilcox,
+  # Variable pointing to the annotation results
+  y = annotation,
+  # We tell how to merge sobj_wilcox
+  by.x = "SYMBOL",
+  # We tell how to merge annotation
+  by.y = "SYMBOL"
+)
+
+ +
+
+

4.2 Restricted sed of +genes

+

As we perform the analysis, we are going to provide a numeric +variable to the GSEA/Enrichment.

+

We have the following columns:

+
    +
  1. p_val: Ignore this column. Always ignore raw p-values. +Look at corrected ones, and if they are missing, then compute them.
  2. +
  3. avg_log2FC: Average Log2(FoldChange). Illustrates how +much a gene is differentially expessed between samples in each +condition.
  4. +
  5. pct.1: Percent of cells with gene expression in +condition one, here in “G1” phase.
  6. +
  7. pct.2: Percent of cells with gene expression in +condition two, here in “S” phase.
  8. +
  9. p_val_adj: The confidence we have in the result. The +closer to 0, the lesser is the risk of error.
  10. +
+

Is it a good idea to use p_val ? What are the +consequences ?

+
+ +Answer + +

No. Never. Never ever use raw P-Value. It is never a good idea.

+
+


+

Is it a good idea to use avg_log2FC ? What are the +consequences ?

+
+ +Answer + +

It is a good idea, we could answer biological questions like : +“Considering differentially expressed genes, what are the pathways with +highest expression change ?”

+
+


+

Is it a good idea to use pct.1 ? What are the +consequences ?

+
+ +Answer + +

It is a good idea, we could answer biological questions like : +“Considering differentially expressed genes, what are the expression of +genes in pathway XXX in the first condition ?”

+
+


+

Is it a good idea to use pct.2 ? What are the +consequences ?

+
+ +Answer + +

It is a good idea, we could answer biological questions like : +“Considering differentially expressed genes, what are the expression of +genes in pathway XXX in the second condition ?”

+
+


+

Is it a good idea to use p_val_adj ? What are the +consequences ?

+
+ +Answer + +

It is a good idea, we could answer biological questions like : “Which +pathways are differentially expressed with highest confidence interval +?”

+

But in order to perform surch test, use +-log10(Adjusted P-Value) instead of the raw adjusted +p-value. Remember, 0 is a good confidence interval, and 1 a bad one. So +we need the values close to 0 to be highly positive.

+
+


+
+
+

4.3 Enrichment vs +GSEA

+

Previously, we used a function called enrichIt. We +talked about enrichment analysis, yet this talk is called GSEA.

+

This is due to the fact that both techniques exists and are +different.

+

Enrichment tests a list of genes. Their expression, confidence +interval, etc. Nothing else matters than their name.

+

For each cell, enrichIt removes zeros and tests the list +of remaining genes. Their expression does not matter. Their order in the +gene list does not matter. Their impact on the pathway does not matter. +Their differential expression status does not matter.

+

Behind the scenes, it’s a very simple tests answering the following +question: “Among expressed genes, what are the odds that it belongs to +the pathway XXX?”

+

We can do enrichment tests on our wilxoc results, using the function +enrichGO +from the package clusterProfiler:

+
# We store the result of the function `enrichGO` from package `clusterProfiler`
+# in the function `enrich_wilcox`
+enrich_wilcox <- clusterProfiler::enrichGO(
+  # We provide the list of gene identifiers to test
+  gene = sobj_wilcox$ENTREZID,
+  # We provide the complete list of genes we quantified
+  universe = annotation$ENTREZID,
+  # We provide mouse annotations
+  OrgDb = org.Mm.eg.db,
+  # We tell the function that we use entrez-identifiers
+  keyType = "ENTREZID",
+  # We search results on Biological Process"
+  ont = "BP",
+  # We want all the results
+  pvalueCutoff = 1,
+  # We are humans, we wan human-readable gene names
+  readable = TRUE
+)
+
+ +

We can have a look at pathways including the word G2M, +using the functions with and grepl from +base package.

+
# Get the result table
+results_enrich_wilcox <- enrich_wilcox@result
+
+# We store the result of the line selection
+# in a variable called `g2m_enrich`
+g2m_enrich <- results_enrich_wilcox[
+  # We select rows containing a given information
+  # using the function `with` from `base` package
+  base::with(
+    # We provide the variable pointing to enrichment results
+    data = results_enrich_wilcox,
+    # We provide the term we are searching and the column in which
+    # the term should be searched
+    base::grepl("G2M", Description)
+  ), # Do not forget his comma, since we are filtering a dataframe on rows
+]
+
+ +

GSEA tests the gene names, and uses the numeric value associated with +that gene in order to weight the results. It tests the name, the rank, +and the numeric value.

+

First, we need to create a named list of gene weights. For this +example, we use the average fold change as weights.

+
# We extract gene average fold change
+gene_list <- sobj_wilcox$avg_log2FC
+# We extract genes Entrez ID
+base::names(gene_list) <- sobj_wilcox$ENTREZID
+# We sort that list deacreasingly
+gene_list <- base::sort(gene_list, decreasing=TRUE)
+
+# Check the results with the function `head` from `utils` package
+utils::head(gene_list)
+
   12504    12259    12262    14960    16149    14969 
+8.970173 8.696339 8.573637 8.452598 8.323054 8.257669 
+

And now, we can run the gseGO +from clusterProfiler. +You’ll see, the interfacce is almost the same as for the enrichment:

+
# We use the function `gseGO` from the package `clusterProfiler`
+# and store the result in a variable called `gsea_wilcox`
+gsea_wilcox <- clusterProfiler::gseGO(
+  # We provide the variable pointing to named gene list
+  geneList = gene_list,
+  # We provide mouse annotations
+  OrgDb = org.Mm.eg.db,
+  # We tell the function that we use entrez-identifiers
+  keyType = "ENTREZID",
+  # We search results on Biological Process"
+  ont = "BP",
+  # We want all the results
+  pvalueCutoff = 1
+)
+
+ +

Just like before, we can have a look at pathways including the word +G2M, using the functions with and grepl from +base package.

+
# Get the result table
+results_gse_wilcox <- enrich_wilcox@result
+
+# We store the result of the line selection
+# in a variable called `g2m_enrich`
+g2m_gse <- results_gse_wilcox[
+  # We select rows containing a given information
+  # using the function `with` from `base` package
+  base::with(
+    # We provide the variable pointing to enrichment results
+    data = results_gse_wilcox,
+    # We provide the term we are searching and the column in which
+    # the term should be searched
+    base::grepl("G2/M", Description)
+  ), # Do not forget his comma, since we are filtering a dataframe on rows !
+]
+
+# Check the results with the function `head` from `utils` package
+utils::head(g2m_gse)
+
                   ID
+GO:1902751 GO:1902751
+GO:0010971 GO:0010971
+GO:1902749 GO:1902749
+GO:0044839 GO:0044839
+GO:0000086 GO:0000086
+GO:0010389 GO:0010389
+                                                            Description
+GO:1902751      positive regulation of cell cycle G2/M phase transition
+GO:0010971 positive regulation of G2/M transition of mitotic cell cycle
+GO:1902749               regulation of cell cycle G2/M phase transition
+GO:0044839                             cell cycle G2/M phase transition
+GO:0000086                        G2/M transition of mitotic cell cycle
+GO:0010389          regulation of G2/M transition of mitotic cell cycle
+           GeneRatio  BgRatio     pvalue  p.adjust    qvalue
+GO:1902751     3/394  23/9476 0.06825935 0.4259430 0.3891670
+GO:0010971     2/394  20/9476 0.20114319 0.6661140 0.6086015
+GO:1902749     5/394  90/9476 0.31953152 0.7945751 0.7259713
+GO:0044839     6/394 112/9476 0.32235419 0.7945751 0.7259713
+GO:0000086     5/394  97/9476 0.37773602 0.7945751 0.7259713
+GO:0010389     4/394  80/9476 0.42724795 0.7945751 0.7259713
+                                     geneID Count
+GO:1902751                   Cdk4/Mta3/Npm1     3
+GO:0010971                        Cdk4/Mta3     2
+GO:1902749       Babam2/Cdk4/Cdk6/Mta3/Npm1     5
+GO:0044839 Babam2/Calm1/Cdk4/Cdk6/Mta3/Npm1     6
+GO:0000086      Babam2/Calm1/Cdk4/Cdk6/Mta3     5
+GO:0010389            Babam2/Cdk4/Cdk6/Mta3     4
+
+ +
+
+

4.4 Visualize +results

+

This makes a lot of tables. Let’s make a lot of graphs !

+

We can use the function barplot +from package graphics ; and +not the ones from any other package, or it won’t work !

+
# We use the function `barplot` from package `enrichplot`
+graphics::barplot(
+  # We provide the variable pointing to enrichment results
+  height = enrich_wilcox,
+  # We display the best 15 results
+  howCategory=15
+)
+

+

We can also use the function ditplot +from package enrichplot ; +and not the ones from any other package, or it won’t work ! Note the +change in the input parameter name:

+
# We use the function `barplot` from package `enrichplot`
+enrichplot::dotplot(
+  # We provide the variable pointing to enrichment results
+  object = enrich_wilcox,
+  # We display the best 15 results
+  showCategory=15
+)
+

+

We can display traditional GSEA graph using gseaplot2 +from package enrichplot:

+
# We use the function `barplot` from package `enrichplot`
+enrichplot::gseaplot2(
+  # We provide the variable pointing to enrichment results
+  x = gsea_wilcox,
+  # We display the best result
+  geneSetID = 1
+)
+

+

With GSEA, you dot not test if a pathway is up or down regulated. A +pathway contains both enhancers and suppressors genes. An up-regulation +of enhancer genes and a down-regulation of suppressor genes will lead to +a “bad” enrichment score. However, this will lead to a strong change in +your pathway activity!

+

If your favorite pathway does not have a “good enrichment score”, it +does not mean that pathway is not affected.

+

The heatplot +displays both a heatmap of enriched pathways and their genes in +commons:

+
# We use the function `heatplot` from `enrichplot` package
+enrichplot::heatplot(
+  # We probide the variable pointing to GSEA results
+  x = gsea_wilcox,
+  # We show the 15 best results
+  showCategory = 15,
+  # We color according to FoldChange
+  foldChange = gene_list
+)
+

+

The genes in common between multiple gene sets are also visible +through an uspet plot:

+
# We use the function `upsetplot` from `enrichplot` package
+enrichplot::upsetplot(
+  # We probide the variable pointing to GSEA results
+  x = gsea_wilcox,
+  # We show the 10 best results
+  n = 10
+)
+

+

Finally, we can display whole pathways using KEGG database:

+
# We use the function pathview from pathview package
+pathview::pathview(
+  gene.data = gene_list, # Our gene list
+  pathway.id = "mmu04110", # Our pathway
+  species = "mmu", # Our organism
+  # The color limits
+  limit = list(gene=max(abs(gene_list))),
+  gene.idtype = "ENTREZID" # The genes identifiers
+)
+
+ +

pathview_results

+
+
+
+

4.5 Session Info

+

This list of all packages used while you work should be included in +each en every R presentation:

+
utils::sessionInfo()
+
R version 4.4.1 (2024-06-14)
+Platform: x86_64-conda-linux-gnu
+Running under: Ubuntu 20.04.6 LTS
+
+Matrix products: default
+BLAS/LAPACK: /shared/ifbstor1/software/miniconda/envs/r-4.4.1/lib/libopenblasp-r0.3.27.so;  LAPACK version 3.12.0
+
+locale:
+ [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
+ [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
+ [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
+ [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
+ [9] LC_ADDRESS=C               LC_TELEPHONE=C            
+[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
+
+time zone: Europe/Paris
+tzcode source: system (glibc)
+
+attached base packages:
+[1] stats4    stats     graphics  grDevices utils     datasets  methods  
+[8] base     
+
+other attached packages:
+ [1] pathview_1.44.0             Cairo_1.6-2                
+ [3] org.Mm.eg.db_3.19.1         AnnotationDbi_1.66.0       
+ [5] dittoSeq_1.16.0             clusterProfiler_4.12.6     
+ [7] escape_2.0.0                SingleCellExperiment_1.26.0
+ [9] SummarizedExperiment_1.34.0 Biobase_2.64.0             
+[11] GenomicRanges_1.56.2        GenomeInfoDb_1.40.1        
+[13] IRanges_2.38.1              S4Vectors_0.42.1           
+[15] BiocGenerics_0.50.0         MatrixGenerics_1.16.0      
+[17] matrixStats_1.4.1           Seurat_5.1.0               
+[19] SeuratObject_5.0.2          sp_2.1-4                   
+[21] dplyr_1.1.4                 knitr_1.48                 
+[23] rstatix_0.7.2               ggpubr_0.6.0               
+[25] ggplot2_3.5.1               DT_0.33                    
+[27] BiocParallel_1.38.0        
+
+loaded via a namespace (and not attached):
+  [1] SpatialExperiment_1.14.0  R.methodsS3_1.8.2        
+  [3] GSEABase_1.66.0           nnet_7.3-19              
+  [5] goftest_1.2-3             Biostrings_2.72.1        
+  [7] HDF5Array_1.32.0          vctrs_0.6.5              
+  [9] spatstat.random_3.3-2     digest_0.6.37            
+ [11] png_0.1-8                 ggrepel_0.9.6            
+ [13] deldir_2.0-4              parallelly_1.38.0        
+ [15] magick_2.8.5              MASS_7.3-61              
+ [17] reshape2_1.4.4            httpuv_1.6.15            
+ [19] qvalue_2.36.0             withr_3.0.1              
+ [21] xfun_0.48                 ggfun_0.1.6              
+ [23] survival_3.7-0            memoise_2.0.1            
+ [25] gson_0.1.0                tidytree_0.4.6           
+ [27] zoo_1.8-12                KEGGgraph_1.64.0         
+ [29] pbapply_1.7-2             ggdist_3.3.2             
+ [31] R.oo_1.26.0               Formula_1.2-5            
+ [33] KEGGREST_1.44.0           promises_1.3.0           
+ [35] httr_1.4.7                globals_0.16.3           
+ [37] fitdistrplus_1.2-1        rhdf5filters_1.16.0      
+ [39] rhdf5_2.48.0              rstudioapi_0.17.0        
+ [41] UCSC.utils_1.0.0          miniUI_0.1.1.1           
+ [43] generics_0.1.3            DOSE_3.30.1              
+ [45] base64enc_0.1-3           babelgene_22.9           
+ [47] zlibbioc_1.50.0           ScaledMatrix_1.12.0      
+ [49] ggraph_2.2.1              polyclip_1.10-7          
+ [51] GenomeInfoDbData_1.2.12   SparseArray_1.4.8        
+ [53] xtable_1.8-4              stringr_1.5.1            
+ [55] evaluate_1.0.1            S4Arrays_1.4.1           
+ [57] irlba_2.3.5.1             colorspace_2.1-1         
+ [59] ROCR_1.0-11               reticulate_1.39.0        
+ [61] spatstat.data_3.1-2       Rgraphviz_2.48.0         
+ [63] magrittr_2.0.3            lmtest_0.9-40            
+ [65] later_1.3.2               viridis_0.6.5            
+ [67] ggtree_3.12.0             lattice_0.22-6           
+ [69] spatstat.geom_3.3-3       future.apply_1.11.2      
+ [71] scattermore_1.2           XML_3.99-0.17            
+ [73] shadowtext_0.1.3          cowplot_1.1.3            
+ [75] ggupset_0.4.0             RcppAnnoy_0.0.22         
+ [77] Hmisc_5.1-3               pillar_1.9.0             
+ [79] nlme_3.1-165              compiler_4.4.1           
+ [81] beachmat_2.20.0           RSpectra_0.16-2          
+ [83] stringi_1.8.4             tensor_1.5               
+ [85] plyr_1.8.9                msigdbr_7.5.1            
+ [87] crayon_1.5.3              abind_1.4-8              
+ [89] gridGraphics_0.5-1        org.Hs.eg.db_3.19.1      
+ [91] graphlayouts_1.1.1        bit_4.5.0                
+ [93] fastmatch_1.1-4           codetools_0.2-20         
+ [95] BiocSingular_1.20.0       crosstalk_1.2.1          
+ [97] bslib_0.8.0               plotly_4.10.4            
+ [99] mime_0.12                 splines_4.4.1            
+[101] Rcpp_1.0.13               fastDummies_1.7.4        
+[103] sparseMatrixStats_1.16.0  HDO.db_0.99.1            
+[105] blob_1.2.4                utf8_1.2.4               
+[107] fs_1.6.4                  listenv_0.9.1            
+[109] checkmate_2.3.1           DelayedMatrixStats_1.26.0
+[111] GSVA_1.52.3               ggsignif_0.6.4           
+[113] ggplotify_0.1.2           tibble_3.2.1             
+[115] Matrix_1.7-1              tweenr_2.0.3             
+[117] pkgconfig_2.0.3           pheatmap_1.0.12          
+[119] tools_4.4.1               cachem_1.1.0             
+[121] RSQLite_2.3.7             viridisLite_0.4.2        
+[123] DBI_1.2.3                 fastmap_1.2.0            
+[125] rmarkdown_2.28            scales_1.3.0             
+[127] grid_4.4.1                ica_1.0-3                
+[129] broom_1.0.7               sass_0.4.9               
+[131] patchwork_1.3.0           dotCall64_1.2            
+[133] graph_1.82.0              carData_3.0-5            
+[135] RANN_2.6.2                rpart_4.1.23             
+[137] snow_0.4-4                farver_2.1.2             
+[139] tidygraph_1.3.1           scatterpie_0.2.4         
+[141] yaml_2.3.10               foreign_0.8-86           
+[143] cli_3.6.3                 purrr_1.0.2              
+[145] UCell_2.8.0               leiden_0.4.3.1           
+[147] lifecycle_1.0.4           uwot_0.2.2               
+[149] backports_1.5.0           annotate_1.82.0          
+[151] gtable_0.3.5              rjson_0.2.21             
+[153] ggridges_0.5.6            progressr_0.14.0         
+[155] parallel_4.4.1            ape_5.8                  
+[157] jsonlite_1.8.9            bitops_1.0-9             
+[159] RcppHNSW_0.6.0            bit64_4.5.2              
+[161] Rtsne_0.17                yulab.utils_0.1.7        
+[163] spatstat.utils_3.1-0      BiocNeighbors_1.22.0     
+[165] highr_0.11                jquerylib_0.1.4          
+[167] GOSemSim_2.30.2           spatstat.univar_3.0-1    
+[169] distributional_0.4.0      R.utils_2.12.3           
+[171] lazyeval_0.2.2            shiny_1.9.1              
+[173] htmltools_0.5.8.1         enrichplot_1.24.4        
+[175] GO.db_3.19.1              sctransform_0.4.1        
+[177] rappdirs_0.3.3            glue_1.8.0               
+[179] spam_2.11-0               httr2_1.0.1              
+[181] XVector_0.44.0            RCurl_1.98-1.14          
+[183] treeio_1.28.0             gridExtra_2.3            
+[185] AUCell_1.26.0             igraph_2.1.1             
+[187] R6_2.5.1                  tidyr_1.3.1              
+[189] labeling_0.4.3            ggpointdensity_0.1.0     
+[191] cluster_2.1.6             Rhdf5lib_1.26.0          
+[193] aplot_0.2.3               DelayedArray_0.30.1      
+[195] tidyselect_1.2.1          htmlTable_2.4.2          
+[197] ggforce_0.4.2             car_3.1-3                
+[199] future_1.34.0             rsvd_1.0.5               
+[201] munsell_0.5.1             KernSmooth_2.23-24       
+[203] data.table_1.16.2         htmlwidgets_1.6.4        
+[205] fgsea_1.30.0              RColorBrewer_1.1-3       
+[207] rlang_1.1.4               spatstat.sparse_3.1-0    
+[209] spatstat.explore_3.3-2    fansi_1.0.6              
+
+
+ +
---
title: "<CENTER>EBAII n1 2024<BR /><B>Gene Set Enrichment Analysis</B></CENTER>"
date: "`r Sys.Date()`"
author:
  - name: "Thibault DAYRIS"
    email: "thibault.dayris@gustaveroussy.fr"
  - name: "Bastien JOB"
    email: "bastien.job@gustaveroussy.fr"
output:
  html_document:  # Defautl view
    highlight: tango  ## Theme for the code chunks
    number_sections: true  ## Adds number to headers (sections)
    theme: flatly  ## CSS theme for the HTML page
    toc: true  ## Adds a table of content
    toc_float:  ## TOC options
      collapsed: true  ## By default, the TOC is folded
      smooth_scroll: true ## Smooth scroll of the HTML page
    self_contained: true ## Includes all plots/images within the HTML
    code_download: true ## Adds a button to download the Rmd
    code_folding: show
    thumbnails: false
    lightbox: true
    fig_caption: true
    gallery: true
    use_bookdown: true
always_allow_html: true ## Allow plain HTML code in the Rmd
editor_options: 
  markdown: 
    wrap: 88
---


<!-- Add the Roscoff banner -->

```{css banner, echo = FALSE}
body {
  background-image: url('ebaii_banner.png');
  background-repeat: no-repeat;
  background-size: 100%;
  margin: 10%
}

div {
  background-color: rgba(255, 255, 255, 0.35)   /* 35% opaque white */;
  padding: 0.25em;
}
```

<!-- Allows to hide the TOC by default, display it with a button, move it to the right or left of the page -->

```{r setup, include=FALSE, echo=FALSE}
knitr::opts_chunk$set(
  echo = TRUE,        # Print the code
  eval = TRUE,       # Do not run command lines
  message = FALSE,    # Print messages
  prompt = FALSE,     # Do not display prompt
  comment = NA,       # No comments on this section
  warning = FALSE,     # Display warnings
  tidy = FALSE,
  fig.align = "center",
  width = 100       # Number of characters per line
)
hooks = knitr::knit_hooks$get()
hook_foldable = function(type) {
  force(type)
  function(x, options) {
    res = hooks[[type]](x, options)

    if (isFALSE(options[[paste0("fold.", type)]])) return(res)

    paste0(
      "<details><summary>Show ", type, "</summary>\n\n",
      res,
      "\n\n</details>"
    )
  }
}
Hmisc::hidingTOC(
  buttonLabel = 'Show TOC', 
  hidden = TRUE, 
  tocSide = 'left', 
  buttonSide='left', 
  posCollapse = 'margin', 
  levels = 3
)
my_seed <- 1337L
```

<!-- CSS to color chunks and outputs -->

```{css chunks, echo = FALSE}
div.beyond pre { background-color: pink; color : black; }
div.beyond pre.r { background-color: lightblue; border: 3px solid blue; }
div.notrun pre { background-color: lightyellow; color : brown; }
div.notrun pre.r { background-color: lightgrey; border: 3px solid black; }
```

<style type="text/css">
details:hover { 
  cursor: pointer 
}
body {
  text-align: justify
}
.column-left{
  float: left;
  width: 47%;
  text-align: left;
}
.column-right{
  float: right;
  width: 47%;
  text-align: left;
}
</style>

# Forewords

## TLDR: R command lines

In this presentation, there will be screen captures for you to follow the 
lesson. There will also be every single R command lines. 
Do not take care of the command lines if you find them too challenging. 
Our goal here, is to understand the main mechanism of Differential 
Expression Analysis. R is just a tool.

Below are the libraries we need to perform this whole session:

```{r load_libraries, eval=TRUE, echo=TRUE}
base::library(package = "BiocParallel")    # Optionally multithread some steps
base::library(package = "DT")              # Display nice table in HTML
base::library(package = "ggplot2")         # Draw graphs and plots
base::library(package = "ggpubr")          # Draw nicer graphs
base::library(package = "rstatix")         # Base R statistics
base::library(package = "knitr")           # Build this presentation
base::library(package = "dplyr")           # Handle big tables
base::library(package = "Seurat")          # Handle SingleCell analyses
base::library(package = "SeuratObject")    # Handle SingleCell objects
base::library(package = "SingleCellExperiment") # Handle SingleCell file formats
base::library(package = "escape")          # Perform exploratory enrichment analysis
base::library(package = "clusterProfiler") # Perform GSEA analysis
base::library(package = "dittoSeq")        # Draw nice plots based on Seurat
base::library(package = "org.Mm.eg.db")    # Mouse genome annotation
base::library(package = "Cairo")           # Graphs library
base::library(package = "pathview")        # For the whole pathway graph
```

First, we load Seurat object:

```{r load_rds_tldr, eval=TRUE, echo=TRUE}
sobj <- base::readRDS(
  # Path to the RDS file
  file = "DEA_Scaled_Normalized_Filtered.RDS"
)
```

Then we launch enrichment exploration on all counts:

```{r escape_enrichment_tldr, eval=FALSE, echo=TRUE}
# Acquire gene sets
mh_hallmark <- escape::getGeneSets(
  species = "Mus musculus",
  library = "H"
)

# Run enrichment
enrichment_matrix <- escape::escape.matrix(
  input.data = sobj,
  gene.sets = mh_hallmark,
  method = "ssGSEA",
  groups =  1000,
  min.size = 5,
  BPPARAM = BiocParallel::SnowParam(workers = 2),
)

# Save enrichment in seurat object
sobj[["escape"]] <- SeuratObject::CreateAssay5Object(
  data=t(enrichment_matrix),
)

saveRDS(sobj, file = "sobj_GSEA.RDS")
```


## Purpose of this session

Up to now, we have:

1. Identified to which cell each sequenced reads come from
1. Identified to which gene each read come from
1. Identified possible bias in gene expression for each cell
1. Filtered and corrected these bias as well as we can
1. Found differentially expressed genes across multiple conditions
1. Annotated cell clusters

We would like to identify the functions of genes among several clusters,
or differentially expressed genes.

At the end of this session you will know:

1. What is gene set analysis
1. How to choose a Gene Set database
1. How to perform an enrichment analysis
1. How to read Gene set analysis results


# Select a database

## Gene: Jund

Let's search information about this gene on the web. For mice, one of the
best web-site for human is: [MGI](https://www.informatics.jax.org/).

If we search for the gene [Plac8](https://www.informatics.jax.org/marker/MGI:2445289),
we find the following:

![plac8_mgi](images/plca8_mgi.png)

As we click on "Phenotype reference", we can see that the MGI database can give
us [PubMeb IDs](https://pubmed.ncbi.nlm.nih.gov/) related to the gene Plca8.
Below, on the same page, we can see regulation pathways, protein ontology,
gene interactions, etc.

![plca8_pathway](images/plac8_pathways.png)

We illustrate here, that GSEA does not only include gene-to-function, or
gene-to-pathway. They usually include many more information.

## Database types

![db_types](images/gene_sets.png)

There is a database for pretty much everything you might want. From
pharmacology, to regulatory objects, etc.

Our dataset contains gene expression. But the same methods can be applied to
other kind of datasets. _E.g._

![ppis](images/ppis.png)

Protein-protein interactions, drug interactions, SNP-interactions, etc.

Within R, you may find genome databases for a wide range of organisms:

![org_bd](images/org_db.png)

## Some noticeable databases

```{r databases_example, eval=TRUE, echo=FALSE}
databases <- base::data.frame(
  MSigDB = c("https://www.gsea-msigdb.org/gsea/index.jsp"),
  KEGG = c("https://www.genome.jp/kegg/"),
  GO = c("https://www.geneontology.org/"),
  Ensembl = c("https://ftp.ensembl.org/pub/"),
  Panther = c("https://pantherdb.org/"),
  Jaspar = c("https://jaspar.genereg.net/"),
  GWAScat = c("https://www.ebi.ac.uk/gwas/"),
  BS_Genome = c("https://bioconductor.org/packages/release/BiocViews.html#___BSgenome"),
  MeSH = c("https://www.nlm.nih.gov/mesh/meshhome.html"),
  OrgDb = c("https://bioconductor.org/packages/release/BiocViews.html#___OrgDb"),
  NCG = c("http://www.network-cancer-genes.org/"),
  PharmGKB = c("https://www.pharmgkb.org/")
)

base::rownames(databases) <- c("Address")

DT::datatable(base::t(databases))
```

For this session, we are going to use MSigBD.

## How to perform GSEA

There are two main types of Gene Set Enrichment Analysis:
1. A blind search among all genes in order to identify pathways or mechanisms among cells.
1. A specific search through genes of interest

The first is called "Exploratory" analysis and starts on the `Seurat` object.

Many packages perform this kind of exploratory analysis, I chose to present 
[`escape`](https://bioconductor.org/packages/release/bioc/html/escape.html)
for its simple and intuitive usage.

What really matters is the GSEA method used in back-end. There are several
very well known methods: [`fgsea`](https://bioconductor.org/packages/release/bioc/html/fgsea.html),
[`gsva`](https://www.bioconductor.org/packages/release/bioc/html/GSVA.html),
[`DOSE`](https://www.bioconductor.org/packages/release/bioc/html/DOSE.html),
and the [`MSigDB`](https://www.gsea-msigdb.org/gsea/index.jsp)'s tool. While
choosing your favorite R package to perform GSEA, if that package uses one
of these methods, then you should not have odd questions through your 
publication process.

These methods are very alike, based on the same statistical resorts, and
suffer for the same limitations. Let's illustrate it with examples.

# Exploratory analysis

## Gene sets

Up to now, we already define what's a _gene set_: a group of genes of interest.
We also know these king of relationships are stored in [`GMT`](https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29)
files or [`TSV`](https://fr.wikipedia.org/wiki/Tabulation-separated_values) 
files.

Since the beginning of our week, we've been working with R, and the question 
naturally comes: how to make/use gene sets in R?

### Create your own gene set

Very easily. We need a "named [list](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/list)
of [vectors](https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/c)",
all coming from `base` R package.

```{r named_list_vectors, eval=TRUE, echo=TRUE}
# We use the function list from base package
pathways <- base::list(
  # We use the function `c` from base package
  proliferation = base::c(
    "Ifng",	"Kras",	"Mgat5",	
    "Prf1",	"Trem1",	"Trp53"
  ),
  growth = base::c(
    "Cdkn2a", "Fos", "Ifnb1",
    "Rras", "Apc", "Sell"
  )
)
utils::head(pathways)
```

This method is very useful when you want to compare your cells toward
unpublished data, gene sets from an article not sourced in official databases,
etc.

### Load a gene set from disk

We can also load a GMT file from a file. This is very usefull when we
expect both known and new pathways to be tested, or known pathways that 
should be updated.

This is done  with the function [`read.gmt`](https://rdrr.io/bioc/clusterProfiler/man/read-gmt.html)
from the [`clusterProfiler`](https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html)
package.

```{r get_gene_sets_file, eval=TRUE, echo=TRUE}
pathways <- clusterProfiler::read.gmt(
  gmtfile = "m5.mpt.v2023.1.Mm.symbols.gmt"
)
pathways <- BiocGenerics::lapply(
  S4Vectors::split(
    pathways[-1], pathways$term
  ),
  function(x) x[x != ""]
)
utils::head(pathways)
```

### Load a gene set from the web

We can also get pathways from the web using the function [`getGeneSets`](https://rdrr.io/bioc/escape/man/getGeneSets.html)
from package [`escape`](https://rdrr.io/bioc/escape/).

```{r get_gene_sets_msigdb, eval=TRUE, echo=TRUE}
# Pay attention to the genome name, it's case sensitive
# and follows MSigDB naming scheme
mh_hallmark <- escape::getGeneSets(
  species = "Mus musculus", 
  library = "H"
)
utils::head(pathways)
```

## Run exploratory analysis

Let's time to practice. Use the function [`escape.matrix`](https://rdrr.io/github/ncborcherding/escape/man/escape.matrix.html)
from the package [`escape`](https://rdrr.io/bioc/escape/) in order to perform
GSEA on your [`Seurat`]()
object. Perform the analysis on cell cycle phase if possible. 
Save the results in a  variable, for example `sobj_enrich`.

<details>

<summary>Answer</summary>

It is not possible to specify what group we are considering. GSEA/Enrichment 
analysis do not perform any differential analysis.

```{r escape_enrich, eval=TRUE, echo=TRUE}
# Save in a variable called `sobj_enrich`
# the result of the function `runEscape` from package `escape`
enrichment_matrix <- escape::escape.matrix(
  # Out Seurat object
  input.data = sobj,
  # Provide gene sets
  gene.sets = mh_hallmark,
  # Select your favorite method
  method = "ssGSEA",
  # ssGSEA parameters
  groups =  1000,
  min.size = 5,
  # Use 2 threads/CPU/workers
  BPPARAM = BiocParallel::SnowParam(workers = 2),
)
```

If you have reserved more than one core, this function can be multi-threaded,
making it faster !

</details>
<br />


## Visualize results


We can visualise the gene sets enriched in cell clusters using the function [`FeaturePlot`](https://satijalab.org/seurat/reference/featureplot) from
`Seurat` package. We just need to color the cells with `ssGSEA`'s results:

```{r unseen_load, eval=TRUE, echo=FALSE}
sobj <- readRDS("sobj_GSEA.RDS")
```

```{r featureplot_seurat_ssgsea, eval=TRUE, echo=TRUE}
Seurat::FeaturePlot(object = sobj, features = "HALLMARK-DNA-REPAIR")
```

# On purpose analysis

In order to understand what happened, where do these results come from,
we need to go step by step and perform both enrichment and GSE analysis
manually.

In general, the exploratory method is never used on a raw Seurat object,
we usually filter this object in order to search pathways that are related to
our biological question.

With the previous method, what ever our biological question was, the cell
cycle phase was one of the top gene expression drivers.

## Gene Names and Gene identifiers

Let's search information about the gene `ARP2`.

![arp2_multiple](images/arp2_multiple_genome.png)

So, this gene exists in multiple organisms. Thanks, we know we are working
on mice, and we told gene set retriever function to consider 'mouse' datasets.

![arp2_names](images/arp2_name.png)

So in the Human genome, ARP2 refers to multiple genes through the alias names.
When we searched for gene sets with [`escape`], we did not perform any 
disambiguation.

The same with _arabidopsis thaliana_, ARP2 can refer to actin related protein,
or a root development protein.

In _mus musculus_, ARP2 refers to an actin related protein on chromosome 11, or
a subunit adaptor protein on chromosome 5.

We now understand that `escape` analysis may be completely wrong since it used
human-intelligible gene names. These names include synonyms, homonyms, within
a single organism, or between multiple ones.

We need to use unique identifiers. These identifiers are called gene identifiers
and we usually consider [`EnsemblID`](https://ftp.ensembl.org/pub/release-60/gtf/mus_musculus/)
or [`EntrezID`](https://www.ncbi.nlm.nih.gov/Web/Search/entrezfs.html).

For example:

1. ARP2 from chromosome 11 equals [`ENSMUSG00000020152`](http://www.ensembl.org/Mus_musculus/Gene/Summary?g=ENSMUSG00000020152;r=11:20012304-20062913) at Ensembl.
1. ARP2 from chromosome 5 equals [`ENSMUSG00000019518`](http://www.ensembl.org/Mus_musculus/Gene/Summary?g=ENSMUSG00000019518;r=5:138170264-138178691) at Ensembl.

These identifiers are very unpleasant to use for human beings. On monday meeting,
please talk about `ARP2` and not `ENSMUSG00000020152`. However, when machines
are working, please given them `EnsemblID` or `EntrezID`.

Let's translate all gene identifiers and evaluate possible number of errors
in escape analysis.

First, let's load the DEA results based on Wilcoxon analysis:

```{r get_de_gene_names, eval=TRUE, echo=TRUE}
# We load our differential expression analysis result
# using the function `readRDS` from `base` package
sobj_wilcox <- base::readRDS(file = "sobj_wilcox.RDS")
# Copy gene names in a column it makes the future operation easier
sobj_wilcox$SYMBOL <- base::rownames(sobj_wilcox)
```

Then, we use the function [`bitr`](https://rdrr.io/bioc/clusterProfiler/man/bitr.html)
(standing for _biological translator_) from the package [`clusterProfiler`](https://bioconductor.org/packages/release/bioc/html/clusterProfiler.html).

```{r cp_bitr_wilcox, eval=TRUE, echo=TRUE}
# We store the result of `bitr` function
# from `clusterProfiler` package
# in a variable called `annotation`
annotation <- clusterProfiler::bitr(
  # We provide the variable pointing to gene names
  geneID = base::rownames(sobj_wilcox),
  # We tell the translator to which gene identifiers
  # translation must be done
  toType = base::c("ENTREZID", "ENSEMBL"),
  # We tell the translator which type of gene name we have
  fromType = "SYMBOL",
  # We provide mmu database
  OrgDb = org.Mm.eg.db
)
```

```{r disply_annotation, eval=TRUE, echo=FALSE}
DT::datatable(annotation)
```

Now we would like to have these annotation alongside with adjusted pvalues
and fold change information. In order to do so, we use the function [`merge`](https://rdrr.io/r/base/merge.html)
from [`base`](https://rdrr.io/r/) package. Not from [`Seurat`](https://rdrr.io/github/atakanekiz/Seurat3.0/man/merge.Seurat.html)
package ! This is used to merger Seurat objects, but we have dataframes here!

```{r merge_annotation_wilcox, eval=TRUE, echo=TRUE}
# We filter these results on **ADJUSTED** pvalue
sobj_wilcox <- sobj_wilcox[sobj_wilcox$p_val_adj <= 0.05, ]
# Use the function `merge` from package `base` in order
# to add annotation to wixocon DEA result.
sobj_wilcox <- base::merge(
  # Variable pointing to the wilcoxon results
  x = sobj_wilcox,
  # Variable pointing to the annotation results
  y = annotation,
  # We tell how to merge sobj_wilcox
  by.x = "SYMBOL",
  # We tell how to merge annotation
  by.y = "SYMBOL"
)
```

```{r display_sobj_wilcox, eval=TRUE, echo=FALSE}
DT::datatable(sobj_wilcox)
```

## Restricted sed of genes

As we perform the analysis, we are going to provide a numeric 
variable to the GSEA/Enrichment.

We have the following columns:

1. `p_val`: Ignore this column. Always ignore raw p-values. Look at corrected ones, and if they are missing, then compute them.
1. `avg_log2FC`: Average Log2(FoldChange). Illustrates how much a gene is differentially expessed between samples in each condition.
1. `pct.1`: Percent of cells with gene expression in condition one, here in "G1" phase.
1. `pct.2`: Percent of cells with gene expression in condition two, here in "S" phase.
1. `p_val_adj`: The confidence we have in the result. The closer to 0, the lesser is the risk of error.

Is it a good idea to use `p_val` ? What are the consequences ?

<details>

<summary>Answer</summary>

No. Never. Never ever use raw P-Value. It is never a good idea.

</details>
<br />

Is it a good idea to use `avg_log2FC` ? What are the consequences ?

<details>

<summary>Answer</summary>

It is a good idea, we could answer biological questions like : "Considering
differentially expressed genes, what are the pathways with highest expression
change ?"

</details>
<br />

Is it a good idea to use `pct.1` ? What are the consequences ?

<details>

<summary>Answer</summary>

It is a good idea, we could answer biological questions like : "Considering
differentially expressed genes, what are the expression of genes in pathway
XXX in the first condition ?"

</details>
<br />

Is it a good idea to use `pct.2` ? What are the consequences ?

<details>

<summary>Answer</summary>

It is a good idea, we could answer biological questions like : "Considering
differentially expressed genes, what are the expression of genes in pathway
XXX in the second condition ?"

</details>
<br />

Is it a good idea to use `p_val_adj` ? What are the consequences ?

<details>

<summary>Answer</summary>

It is a good idea, we could answer biological questions like : "Which pathways
are differentially expressed with highest confidence interval ?"

But in order to perform surch test, use `-log10(Adjusted P-Value)` instead of
the raw adjusted p-value. Remember, 0 is a good confidence interval, and 1 a
bad one. So we need the values close to 0 to be highly positive.

</details>
<br />

## Enrichment vs GSEA

Previously, we used a function called `enrichIt`. We talked about enrichment
analysis, yet this talk is called GSEA.

This is due to the fact that both techniques exists and are different.

Enrichment tests a list of genes. Their expression, confidence
interval, etc. Nothing else matters than their name.

For each cell, `enrichIt` removes zeros and tests the list of remaining genes.
Their expression does not matter. Their order in the gene list does not matter.
Their impact on the pathway does not matter. Their differential expression
status does not matter.

Behind the scenes, it's a very simple tests answering the following question: 
"Among expressed genes, what are the odds that it belongs to the pathway XXX?"

We can do enrichment tests on our wilxoc results, using the function [`enrichGO`](https://rdrr.io/bioc/clusterProfiler/man/enrichGO.html)
from the package [`clusterProfiler`](https://rdrr.io/bioc/clusterProfiler/):

```{r clusterprofiler_enrichment, eval=TRUE, echo=TRUE}
# We store the result of the function `enrichGO` from package `clusterProfiler`
# in the function `enrich_wilcox`
enrich_wilcox <- clusterProfiler::enrichGO(
  # We provide the list of gene identifiers to test
  gene = sobj_wilcox$ENTREZID,
  # We provide the complete list of genes we quantified
  universe = annotation$ENTREZID,
  # We provide mouse annotations
  OrgDb = org.Mm.eg.db,
  # We tell the function that we use entrez-identifiers
  keyType = "ENTREZID",
  # We search results on Biological Process"
  ont = "BP",
  # We want all the results
  pvalueCutoff = 1,
  # We are humans, we wan human-readable gene names
  readable = TRUE
)
```

```{r display_enrich_wilcox, eval=TRUE, echo=FALSE}
DT::datatable(enrich_wilcox@result)
```

We can have a look at pathways including the word `G2M`, using the functions
[`with`](https://rdrr.io/r/base/with.html) and [`grepl`](https://rdrr.io/r/base/grep.html)
from `base` package.

```{r grep_g2m_enrich_cp, eval=TRUE, echo=TRUE}
# Get the result table
results_enrich_wilcox <- enrich_wilcox@result

# We store the result of the line selection
# in a variable called `g2m_enrich`
g2m_enrich <- results_enrich_wilcox[
  # We select rows containing a given information
  # using the function `with` from `base` package
  base::with(
    # We provide the variable pointing to enrichment results
    data = results_enrich_wilcox,
    # We provide the term we are searching and the column in which
    # the term should be searched
    base::grepl("G2M", Description)
  ), # Do not forget his comma, since we are filtering a dataframe on rows
]
```

```{r display_enrich_G2M_wilcox, eval=TRUE, echo=FALSE}
DT::datatable(g2m_enrich)
```

GSEA tests the gene names, and uses the numeric value associated with
that gene in order to weight the results. It tests the name, the rank,
and the numeric value.

First, we need to create a named list of gene weights. For this example,
we use the average fold change as weights.

```{r gseago_wilcox_prepare_gene_list, eval=TRUE, echo=TRUE}
# We extract gene average fold change
gene_list <- sobj_wilcox$avg_log2FC
# We extract genes Entrez ID
base::names(gene_list) <- sobj_wilcox$ENTREZID
# We sort that list deacreasingly
gene_list <- base::sort(gene_list, decreasing=TRUE)

# Check the results with the function `head` from `utils` package
utils::head(gene_list)
```

And now, we can run the [`gseGO`](https://rdrr.io/bioc/clusterProfiler/man/gseGO.html)
from [`clusterProfiler`](https://rdrr.io/bioc/clusterProfiler/). You'll see,
the interfacce is almost the same as for the enrichment:

```{r gseago_wilcox_run, eval=TRUE, echo=TRUE}
# We use the function `gseGO` from the package `clusterProfiler`
# and store the result in a variable called `gsea_wilcox`
gsea_wilcox <- clusterProfiler::gseGO(
  # We provide the variable pointing to named gene list
  geneList = gene_list,
  # We provide mouse annotations
  OrgDb = org.Mm.eg.db,
  # We tell the function that we use entrez-identifiers
  keyType = "ENTREZID",
  # We search results on Biological Process"
  ont = "BP",
  # We want all the results
  pvalueCutoff = 1
)
```

```{r display_gsea_wilcox, eval=TRUE, echo=FALSE}
DT::datatable(gsea_wilcox@result)
```

Just like before, we can have a look at pathways including the word 
`G2M`, using the functions [`with`](https://rdrr.io/r/base/with.html) 
and [`grepl`](https://rdrr.io/r/base/grep.html) from `base` package.

```{r grep_g2m_gsego_cp, eval=TRUE, echo=TRUE}
# Get the result table
results_gse_wilcox <- enrich_wilcox@result

# We store the result of the line selection
# in a variable called `g2m_enrich`
g2m_gse <- results_gse_wilcox[
  # We select rows containing a given information
  # using the function `with` from `base` package
  base::with(
    # We provide the variable pointing to enrichment results
    data = results_gse_wilcox,
    # We provide the term we are searching and the column in which
    # the term should be searched
    base::grepl("G2/M", Description)
  ), # Do not forget his comma, since we are filtering a dataframe on rows !
]

# Check the results with the function `head` from `utils` package
utils::head(g2m_gse)
```

```{r display_gse_G2M_wilcox, eval=TRUE, echo=FALSE}
DT::datatable(g2m_gse)
```

## Visualize results

This makes a lot of tables. Let's make a lot of graphs !

We can use the function [`barplot`](https://rdrr.io/bioc/enrichplot/man/barplot.enrichResult.html)
from package [`graphics`](https://rdrr.io/bioc/graphics/) ; and not the ones
from any other package, or it won't work !

```{r barplot_ego, eval=TRUE, echo=TRUE, fig.height=10}
# We use the function `barplot` from package `enrichplot`
graphics::barplot(
  # We provide the variable pointing to enrichment results
  height = enrich_wilcox,
  # We display the best 15 results
  howCategory=15
)
```

We can also use the function [`ditplot`](https://rdrr.io/bioc/enrichplot/man/barplot.enrichResult.html)
from package [`enrichplot`](https://rdrr.io/bioc/enrichplot/) ; and not the ones
from any other package, or it won't work ! Note the change in the input
parameter name:

```{r dotplot_ego, eval=TRUE, echo=TRUE, fig.height=10}
# We use the function `barplot` from package `enrichplot`
enrichplot::dotplot(
  # We provide the variable pointing to enrichment results
  object = enrich_wilcox,
  # We display the best 15 results
  showCategory=15
)
```

We can display traditional GSEA graph using [`gseaplot2`](https://rdrr.io/bioc/enrichplot/man/gseaplot2.html)
from package [`enrichplot`](https://rdrr.io/bioc/enrichplot/):

```{r gseaplot, eval=TRUE, echo=TRUE}
# We use the function `barplot` from package `enrichplot`
enrichplot::gseaplot2(
  # We provide the variable pointing to enrichment results
  x = gsea_wilcox,
  # We display the best result
  geneSetID = 1
)
```

With GSEA, you dot not test if a pathway is up or down regulated.
A pathway contains both enhancers and suppressors genes. An
up-regulation of enhancer genes and a down-regulation of suppressor genes
will lead to a “bad” enrichment score. However, this will lead to a strong
change in your pathway activity!

If your favorite pathway does not have a “good enrichment score”, it does
not mean that pathway is not affected.

The [`heatplot`](https://rdrr.io/bioc/enrichplot/man/heatplot.html) displays
both a heatmap of enriched pathways *and* their genes in commons:

```{r heatplot_gsea, echo=TRUE, eval=TRUE, fig.height=10}
# We use the function `heatplot` from `enrichplot` package
enrichplot::heatplot(
  # We probide the variable pointing to GSEA results
  x = gsea_wilcox,
  # We show the 15 best results
  showCategory = 15,
  # We color according to FoldChange
  foldChange = gene_list
)
```

The genes in common between multiple gene sets are also visible through an
uspet plot:

```{r upset_gsea, echo=TRUE, eval=TRUE, fig.height=10}
# We use the function `upsetplot` from `enrichplot` package
enrichplot::upsetplot(
  # We probide the variable pointing to GSEA results
  x = gsea_wilcox,
  # We show the 10 best results
  n = 10
)
```

Finally, we can display whole pathways using KEGG database:

```{r pathview_gene_list, eval=FALSE, echo=TRUE}
# We use the function pathview from pathview package
pathview::pathview(
  gene.data = gene_list, # Our gene list
  pathway.id = "mmu04110", # Our pathway
  species = "mmu", # Our organism
  # The color limits
  limit = list(gene=max(abs(gene_list))),
  gene.idtype = "ENTREZID" # The genes identifiers
)
```

![pathview_results](mmu04110.pathview.png)

## Session Info

This list of all packages used while you work should be included
in each en every R presentation:

```{r session_info, eval=TRUE, echo=TRUE}
utils::sessionInfo()
```
+ + +
+
+ +
+ + + + + + + + + + + + + + + + + diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/environment.yaml b/2024/evaiin1/SingleCell/DEA_GSEA/environment.yaml new file mode 100644 index 0000000..267c410 --- /dev/null +++ b/2024/evaiin1/SingleCell/DEA_GSEA/environment.yaml @@ -0,0 +1,34 @@ +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - r-seurat=4.4.0 + - r-seuratobject=4.1.4 + - bioconductor-biocparallel=1.34.2 + - r-dt=0.28 + - r-ggplot2=3.4.3 + - r-ggpubr=0.6.0 + - r-rstatix=0.7.2 + - r-knitr=1.44 + - r-dplyr=1.1.3 + - r-coin=1.4_3 + - r-rmdformats=1.0.4 + - r-rmarkdown=2.25 + - bioconductor-singlecellexperiment=1.22.0 + - bioconductor-clusterprofiler=4.8.1 + - bioconductor-dose=3.26.1 + - bioconductor-org.mm.eg.db=3.17.0 + - bioconductor-gseabase=1.62.0 + - bioconductor-limma=3.56.2 + - bioconductor-deseq2=1.40.2 + - r-remotes=2.4.2 + - r-git2r=0.31.0 + - r-upsetr=1.4.0 + - r-cairo=1.6_1 + - r-ggupset=0.3.0 + - bioconductor-pathview=1.40.0 + - bioconductor-enhancedvolcano=1.18.0 + - bioconductor-escape=1.10.0 + - bioconductor-dittoseq=1.12.0 + - bioconductor-complexheatmap =2.16.0 \ No newline at end of file diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/images/DE_tools.png b/2024/evaiin1/SingleCell/DEA_GSEA/images/DE_tools.png new file mode 100644 index 0000000..c9ff763 Binary files /dev/null and b/2024/evaiin1/SingleCell/DEA_GSEA/images/DE_tools.png differ diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/images/arp2_multiple_genome.png b/2024/evaiin1/SingleCell/DEA_GSEA/images/arp2_multiple_genome.png new file mode 100644 index 0000000..ed75291 Binary files /dev/null and b/2024/evaiin1/SingleCell/DEA_GSEA/images/arp2_multiple_genome.png differ diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/images/arp2_name.png b/2024/evaiin1/SingleCell/DEA_GSEA/images/arp2_name.png new file mode 100644 index 0000000..f37dc9a Binary files /dev/null and b/2024/evaiin1/SingleCell/DEA_GSEA/images/arp2_name.png differ diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/images/gene_sets.png b/2024/evaiin1/SingleCell/DEA_GSEA/images/gene_sets.png new file mode 100644 index 0000000..33dadef Binary files /dev/null and b/2024/evaiin1/SingleCell/DEA_GSEA/images/gene_sets.png differ diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/images/jund_mgi.png b/2024/evaiin1/SingleCell/DEA_GSEA/images/jund_mgi.png new file mode 100644 index 0000000..33d2fe3 Binary files /dev/null and b/2024/evaiin1/SingleCell/DEA_GSEA/images/jund_mgi.png differ diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/images/org_db.png b/2024/evaiin1/SingleCell/DEA_GSEA/images/org_db.png new file mode 100644 index 0000000..7e9a99c Binary files /dev/null and b/2024/evaiin1/SingleCell/DEA_GSEA/images/org_db.png differ diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/images/pathways_jund.png b/2024/evaiin1/SingleCell/DEA_GSEA/images/pathways_jund.png new file mode 100644 index 0000000..5795e71 Binary files /dev/null and b/2024/evaiin1/SingleCell/DEA_GSEA/images/pathways_jund.png differ diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/images/plac8_pathways.png b/2024/evaiin1/SingleCell/DEA_GSEA/images/plac8_pathways.png new file mode 100644 index 0000000..bd937b3 Binary files /dev/null and b/2024/evaiin1/SingleCell/DEA_GSEA/images/plac8_pathways.png differ diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/images/plca8_mgi.png b/2024/evaiin1/SingleCell/DEA_GSEA/images/plca8_mgi.png new file mode 100644 index 0000000..53beb2a Binary files /dev/null and b/2024/evaiin1/SingleCell/DEA_GSEA/images/plca8_mgi.png differ diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/images/ppis.png b/2024/evaiin1/SingleCell/DEA_GSEA/images/ppis.png new file mode 100644 index 0000000..11347eb Binary files /dev/null and b/2024/evaiin1/SingleCell/DEA_GSEA/images/ppis.png differ diff --git a/2024/evaiin1/SingleCell/DEA_GSEA/m5.mpt.v2023.1.Mm.symbols.gmt b/2024/evaiin1/SingleCell/DEA_GSEA/m5.mpt.v2023.1.Mm.symbols.gmt new file mode 100644 index 0000000..2df1635 --- /dev/null +++ b/2024/evaiin1/SingleCell/DEA_GSEA/m5.mpt.v2023.1.Mm.symbols.gmt @@ -0,0 +1,10 @@ +MP_ABNORMAL_TUMOR_VASCULARIZATION http://www.gsea-msigdb.org/gsea/msigdb/mouse/geneset/MP_ABNORMAL_TUMOR_VASCULARIZATION Adamts12 Aggf1 Amotl1 Anxa1 Arhgef4 Cav2 Ccm2l Cd82 Dock4 Fkbpl Gpr4 H2ax Idh2 Ifngr1 Il12rb2 Il1a Il1b Itga6 Itgb3 Jcad Mmrn2 Myct1 Ntn4 Peak1 Pld1 Rhoj Rras S1pr2 Stard13 +MP_DECREASED_CIRCULATING_TUMOR_NECROSIS_FACTOR_LEVEL http://www.gsea-msigdb.org/gsea/msigdb/mouse/geneset/MP_DECREASED_CIRCULATING_TUMOR_NECROSIS_FACTOR_LEVEL Adam17 Apba3 Arid5a Bbs12 C3 Cd14 Cd19 Cd44 Clic4 Enpp2 F3 Ffar2 Foxn1 Foxo1 Gba Hectd3 Ifi204 Ifi35 Ifit2 Ifngr1 Ikbkb Il22 Il6ra Irak2 Irak4 Irf5 Lacc1 Lep Lrrc19 Lrrc8e Mapkapk2 Mbl1 Mcpt1 Mif Msmp Mstn Myd88 Nfkbib Nkg7 Nmi Npy1r Parp1 Peli1 Pilrb1 Plekhf1 Prkce Rhbdf2 Sgms1 Siglec1 Snx8 Sra1 Thbd Ticam1 Tlr2 Tnf Tradd Zdhhc1 +MP_DECREASED_INCIDENCE_OF_INDUCED_TUMORS http://www.gsea-msigdb.org/gsea/msigdb/mouse/geneset/MP_DECREASED_INCIDENCE_OF_INDUCED_TUMORS Ccng1 Cd151 Cdk1 Cebpb Ct55 Cyp2c23 Eif2s2 H2-Ob Il6 Klk6 Krt10 Lrig2 Pgr Terc Uqcc3 Zmiz1 a +MP_DECREASED_INCIDENCE_OF_TUMORS_BY_CHEMICAL_INDUCTION http://www.gsea-msigdb.org/gsea/msigdb/mouse/geneset/MP_DECREASED_INCIDENCE_OF_TUMORS_BY_CHEMICAL_INDUCTION Agtr2 Ahrr Ar Arl6ip5 Birc5 Brca2 Cacfd1 Camk2g Ccl2 Cd151 Cd34 Cd96 Cdkn1b Cdkn2a Cebpb Chd1l Cyp1a2 Cyp1b1 Cyp2e1 Dek Dnajc27 Endov Ephx1 Fgf22 Fntb Foxm1 Foxn1 Fxyd5 Fzr1 Gata6os Hcst Hipk1 Hras Ier3 Il12b Il1r1 Il23a Il6 Il6ra Itgb1 Kdm4c Klk6 Lonp1 Loxl2 Lypd3 Mapkapk2 Mgat3 Mif Mir21a Mir301 Mki67 Mmp11 Mmp19 Mmp1a Mpg Mtdh Muc4 Myd88 Neu3 Pard3 Pik3ca Pla2g4a Plce1 Prdx1 Pten Ptger1 Ptger2 Ptger4 Ptgs2 Ptk2 Ptp4a3 Ptprj Pvr Pycard Ralgds Retnlb Skil Slc3a2 Spp1 Stag1 Stat3 Strap Terc Tert Tiam1 Tlr4 Tnf Tnfaip8l3 Tnik Trem1 Trim27 Trp53 Uri1 +MP_DECREASED_LIVER_TUMOR_INCIDENCE http://www.gsea-msigdb.org/gsea/msigdb/mouse/geneset/MP_DECREASED_LIVER_TUMOR_INCIDENCE Foxm1 Mtdh Myd88 Tert Tlr2 +MP_DECREASED_LUNG_TUMOR_INCIDENCE http://www.gsea-msigdb.org/gsea/msigdb/mouse/geneset/MP_DECREASED_LUNG_TUMOR_INCIDENCE Ggta1 Kras Lexm Met Mtdh Tlr2 +MP_DECREASED_METASTATIC_POTENTIAL http://www.gsea-msigdb.org/gsea/msigdb/mouse/geneset/MP_DECREASED_METASTATIC_POTENTIAL Anxa1 Atf3 Bcl11b Ccm2l Ccr5 Cd96 Cxcr2 Cybb Fbln5 Fos Fut7 Hcst Id1 Ifngr1 Il1a Il1b Jam3 Lyst Mgat5 Mmp9 Nbeal2 Npr1 Plau Pld1 Rhoj Sell Stab2 Stat6 Tmem219 Vegfa Vegfd +MP_DECREASED_TUMOR_GROWTH_SIZE http://www.gsea-msigdb.org/gsea/msigdb/mouse/geneset/MP_DECREASED_TUMOR_GROWTH_SIZE Adam15 Adam9 Antxr1 Anxa1 Apc Arhgef4 Arl6ip5 Cav2 Ccm2l Ccr2 Cd151 Cd248 Cd44 Cdc20 Cdk1 Cxcr2 Cybb Dnajc27 Dot1l Ets2 Fbln5 Fgl1 Fos Fzr1 Gpnmb Gpr4 Gpr68 Gzmb H2ax Hcst Hif1a Id1 Idh2 Igf1 Il12rb2 Il1b Il6ra Jcad Kit Kras Lag3 Loxl2 Mgat5 Mmp1a Mmp2 Msi2 Muc1 Myct1 Nrp1 Nup85 Pard3 Pdcd1 Pgf Plau Pld1 Plg Ptch1 Ptger1 Ptger2 Ptk2 Ptp4a3 Ralgds Rbpj Rcan1 Rhoj Setd4 Skil Slc3a2 Stag1 Trim27 Trp53 +MP_DECREASED_TUMOR_INCIDENCE http://www.gsea-msigdb.org/gsea/msigdb/mouse/geneset/MP_DECREASED_TUMOR_INCIDENCE Adam15 Aggf1 Brca2 Bub1 Cd47 Cdkn1b Cdkn2a Chd1l Esam Espl1 Ifng Igf2bp2 Il10 Mir301 Mmp9 Myc Pten Sell Spn Stk11 Tnfrsf1a Trap1 Trp53 +MP_DECREASED_TUMOR_LATENCY http://www.gsea-msigdb.org/gsea/msigdb/mouse/geneset/MP_DECREASED_TUMOR_LATENCY Aldh2 Atp6v0a2 Brca2 Bub1 Bub1b Dclre1a Eef1a1 Egr1 Erbb2 Ercc2 Fbxw7 Foxo3 Ifng Il22ra2 Klf10 Lox Mcm3 Mir203 Nbn Pard3 Pcsk6 Perp Prf1 Ranbp2 Rasal2 Tnfaip8l3 Trim16 Trp53