From 3de0c608e0f1cc510404326705fc5a02356f5fb9 Mon Sep 17 00:00:00 2001
From: Pratibha Panwar <pratibhapanwar.4@gmail.com>
Date: Thu, 19 Sep 2024 19:01:33 +1000
Subject: [PATCH] Updated vignette content.

---
 R/clustering.R            |  2 ++
 README.md                 |  7 +++--
 man/nsClustering.Rd       |  3 ++
 vignettes/clustSIGNAL.Rmd | 63 ++++++++++++++++++++-------------------
 4 files changed, 42 insertions(+), 33 deletions(-)

diff --git a/R/clustering.R b/R/clustering.R
index 4a61dac..c337d43 100644
--- a/R/clustering.R
+++ b/R/clustering.R
@@ -10,6 +10,8 @@
 #' @param spe SpatialExperiment object. For reclust = FALSE, the object should
 #' contain logcounts and PCA, but for reculst = TRUE, the object should contain
 #' smoothed gene expression.
+#' @param samples a character indicating name of colData(spe) column containing
+#' sample names.
 #' @param dimRed a character indicating the name of the reduced dimensions to
 #' use from the SpatialExperiment object (i.e., from reducedDimNames(spe)).
 #' Default value is 'PCA'.
diff --git a/README.md b/README.md
index 327c6b4..fe061d0 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,9 @@
 
 clustSIGNAL: ***clust***ering of ***S***patially ***I***nformed ***G***ene expression with ***N***eighbourhood ***A***dapted ***L***earning.
 
-An R package to perform spatial clustering on spatially-resolved transcriptomics datasets. Here, we calculate entropy as a measure of "domainness" of cell neighbourhoods, and use it to generate weight distributions to perform adaptive smoothing of gene expression. Homogeneous neighbourhoods have low entropy, and so, smoothing is performed over more cells in these neighbourhoods. Contrarily, heterogeneous neighbourhoods have high entropy and are smoothed over a much smaller region. This approach not only overcomes data sparsity in the gene expression but also incorporates spatial context in the form of cell arrangement information from the neighbourhood. The resulting adaptively smoothed gene expression is used for downstream analyses like clustering.
+An R package to perform spatially-resolved clustering on spatial transcriptomics data. Here, we calculate entropy as a measure of "domainness" of cell neighbourhoods and use it to generate weight distributions to perform adaptive smoothing of gene expression. Homogeneous neighbourhoods have low entropy, and so, smoothing is performed over more cells in these neighbourhoods. Contrarily, heterogeneous neighbourhoods have high entropy and are smoothed over a much smaller region. This approach not only overcomes data sparsity in the gene expression but also incorporates spatial context in the form of cell arrangement information from the neighbourhood. The resulting adaptively smoothed gene expression is used for downstream analyses like clustering.
 
-For tutorials on how to use clustSIGNAL, see the vignettes at this [website](https://sydneybiox.github.io/clustSIGNAL/).
+For a tutorial on how to use clustSIGNAL, see the vignette at this [website](https://sydneybiox.github.io/clustSIGNAL/).
 
 ## Installation
 
@@ -79,5 +79,6 @@ data(example)
 
 # Here, the cell labels are in the column 'uniqueID' and sample labels are in 'sample_id' column.
 set.seed(100)
-res <- clustSIGNAL(spe, samples = "sample_id", cells = "uniqueID", cluster.fun = "leiden", outputs = "a")
+res <- clustSIGNAL(spe, samples = "sample_id", cells = "uniqueID", 
+                   cluster.fun = "leiden", outputs = "a")
 ```
diff --git a/man/nsClustering.Rd b/man/nsClustering.Rd
index dc3b819..666ad25 100644
--- a/man/nsClustering.Rd
+++ b/man/nsClustering.Rd
@@ -11,6 +11,9 @@ nsClustering(spe, samples, dimRed = "PCA", batch = FALSE, reclust, ...)
 contain logcounts and PCA, but for reculst = TRUE, the object should contain
 smoothed gene expression.}
 
+\item{samples}{a character indicating name of colData(spe) column containing
+sample names.}
+
 \item{dimRed}{a character indicating the name of the reduced dimensions to
 use from the SpatialExperiment object (i.e., from reducedDimNames(spe)).
 Default value is 'PCA'.}
diff --git a/vignettes/clustSIGNAL.Rmd b/vignettes/clustSIGNAL.Rmd
index c822adb..d1154f2 100644
--- a/vignettes/clustSIGNAL.Rmd
+++ b/vignettes/clustSIGNAL.Rmd
@@ -22,7 +22,7 @@ knitr::opts_chunk$set(
 
 # Overview
 
-In this vignette, we will demonstrate how to perform spatially-resolved clustering with clusSIGNAL on a dataset containing only one sample. Following this, we will explore the clusters using pre-defined metrics like adjusted rand index (ARI), normalised mutual information (NMI), average silhouette width, and spatial plots. We will also display the use of entropy measures generated as a by-product of clustSIGNAL process in understanding the tissue structure of the sample. Furthermore, the adaptively-smoothed gene expression data generated by clustSIGNAL could be useful for other downstream analyses and will be accessible to the user if they choose to output the final SpatialExperiment object.
+In this vignette, we will demonstrate how to perform spatially-resolved clustering with clustSIGNAL. Following this, we will explore the clusters using pre-defined metrics like adjusted Rand index (ARI), normalised mutual information (NMI), and average silhouette width, as well as spatial plots. We will also show how the entropy measures generated as a by-product of the clustSIGNAL process can be used to understand the tissue structure of a sample. Finally, we will explore multisample analysis with clustSIGNAL.
 
 ```{r setup, message = FALSE, warning = FALSE}
 # load required packages
@@ -36,11 +36,11 @@ library(patchwork)
 library(scattermore)
 ```
 
-# Single sample data analysis 
+# Single sample analysis with clustSIGNAL
 
-Here, we use the SeqFISH mouse embryo dataset from [Lohoff et al, 2021](https://www.nature.com/articles/s41587-021-01006-2), which contains spatial transcriptomics data from 3 mouse embryos, with 351 genes and a total of 57536 cells. For this vignette, we subset the data by randomly selecting 5000 cells from Embryo 2, excluding cells that were manually annotated as 'Low quality'.
+Here, we use the SeqFISH mouse embryo dataset from [Lohoff et al, 2021](https://www.nature.com/articles/s41587-021-01006-2), which contains spatial transcriptomics data from 3 mouse embryos, with 351 genes and a total of 57,536 cells. For this vignette, we subset the data by randomly selecting 5000 cells from Embryo 2, excluding cells that were manually annotated as 'Low quality'.
 
-We begin by creating a SpatialExperiment object from the gene expression and cell information in the data subset, ensuring that the spatial coordinates are stored in spatialCoords within the SpatialExperiment object.
+We begin by creating a SpatialExperiment object from the gene expression and cell information in the data subset, ensuring that the spatial coordinates are stored in spatialCoords within the SpatialExperiment object. If the data are already in a SpatialExperiment object, then the user can directly run clustSIGNAL, after ensuring that the basic requirements like spatial coordinates and normalized counts are met.
 
 ```{r}
 data(mEmbryo2)
@@ -55,10 +55,11 @@ For running clustSIGNAL, we need to know the column names in colData of the Spat
 colnames(colData(spe))
 ```
 
-
 # Running clustSIGNAL on one sample
 
-Next, we run clustSIGNAL using the sample and cell labels we identified earlier. The simplest clustSIGNAL run requires a SpatialExperiment object, two variables holding colData column names containing sample and cell labels, and the type of output the user would like to see. Other parameters that can be modified include dimRed to specify the low dimension data to use, batch to perform batch correction, NN to specify the neighbourhood size, kernel for weight distribution to use, spread for distribution spread value, sort to sort the neighbourhood, threads to specify the number of cpus to use in parallel runs, and ... for additional parameters for clustering steps. 
+Next, we run clustSIGNAL using the sample and cell labels we identified earlier. The simplest clustSIGNAL run requires a SpatialExperiment object, two variables holding colData column names containing sample and cell labels, and the type of output the user would like to see. Other parameters that can be modified include dimRed to specify the low-dimensional data to use, batch to perform batch correction, NN to specify the neighbourhood size, kernel for the weight distribution to use, spread for the distribution spread value, sort to sort the neighbourhood, threads to specify the number of CPUs to use in parallel runs, and ... for additional parameters for clustering steps.
+
+Furthermore, the adaptively smoothed gene expression data generated by clustSIGNAL could be useful for other downstream analyses and will be accessible to the user if they choose to output the final SpatialExperiment object.
 
 ```{r}
 set.seed(100)
@@ -79,16 +80,17 @@ The cluster dataframe contains cell labels and their cluster numbers allotted by
 head(res_emb$clusters, n = 3)
 ```
 
-The final SpatialExperiment object contains the adaptively smoothed gene expression data as an additional assay, as well initial clusters, entropy values, and clustSIGNAL clusters. 
+The final SpatialExperiment object contains the adaptively smoothed gene expression data as an additional assay, as well as initial clusters, entropy values, and clustSIGNAL clusters.
 
 ```{r}
 spe <- res_emb$spe_final
 spe
 ```
 
+
 # Analysing clustSIGNAL results
 
-In this section, we analyse the results from clustSIGNAL through spatial plots and clustering metrics. 
+In this section, we analyse the results from clustSIGNAL through spatial plots and clustering metrics.
 
 ## Visualising clustSIGNAL clusters
 
@@ -112,7 +114,7 @@ df_ent <- as.data.frame(colData(spe))
 spt_clust <- df_ent %>%
   ggplot(aes(x = spatialCoords(spe)[, 1],
              y = -spatialCoords(spe)[, 2])) +
-  geom_scattermore(pointsize = 3, aes(colour = reCluster)) +
+  geom_scattermore(pointsize = 3, aes(colour = clustSIGNAL)) +
   scale_color_manual(values = colors) +
   ggtitle("A") +
   labs(x = "x-coordinate", y = "y-coordinate") +
@@ -123,18 +125,18 @@ spt_clust <- df_ent %>%
 
 # calculating median entropy of each cluster
 celltype_ent <- df_ent %>%
-  group_by(as.character(reCluster)) %>%
+  group_by(as.character(clustSIGNAL)) %>%
   summarise(meanEntropy = median(entropy))
 # reordering clusters by their median entropy
 # low to high median entropy
 cellOrder <- celltype_ent$meanEntropy
-names(cellOrder) <- celltype_ent$`as.character(reCluster)`
+names(cellOrder) <- celltype_ent$`as.character(clustSIGNAL)`
 cellOrder <- sort(cellOrder)
-df_ent$reCluster <- factor(df_ent$reCluster, levels = names(cellOrder))
+df_ent$clustSIGNAL <- factor(df_ent$clustSIGNAL, levels = names(cellOrder))
 # box plot of cluster entropy
 colors_ent <- colors[as.numeric(names(cellOrder))]
 box_clust <- df_ent %>%
-  ggplot(aes(x = reCluster, y = entropy, fill = reCluster)) +
+  ggplot(aes(x = clustSIGNAL, y = entropy, fill = clustSIGNAL)) +
   geom_boxplot() +
   scale_fill_manual(values = colors_ent) +
   ggtitle("B") +
@@ -147,7 +149,7 @@ box_clust <- df_ent %>%
 spt_clust + box_clust + patchwork::plot_layout(guides = "collect", widths = c(2, 3))
 ```
 
-The spatial location (A) and entropy distribution (B) of the clusters provide spatial context of the cells and their neighbourhoods, as well as the compositions of the neighbourhoods. For example, the low entropy of cluster 4 indicates that the cells in this cluster are generally found in more homogeneous space, whereas the high entropy of cluster 7 cells indicates that they belong to regions with more cell diversity. The spatial plot (A) concurs with this entropy-based observation. 
+The spatial location (A) and entropy distribution (B) of the clusters provide spatial context of the cells and their neighbourhoods, as well as the compositions of the neighbourhoods. For example, the low entropy of cluster 4 indicates that the cells in this cluster are generally found in a more homogeneous space, whereas the high entropy of cluster 7 cells indicates that they belong to regions with more cell diversity. This can also be visualized in the spatial plot (A).
 
 ## Cluster metrics
 
@@ -155,7 +157,7 @@ We assess the clustering efficiency of clustSIGNAL using the commonly used clust
 
 ```{r}
 # average silhouette width
-clusts <- as.numeric(as.character(spe$reCluster))
+clusts <- as.numeric(as.character(spe$clustSIGNAL))
 cXg_mat <- t(as.matrix(logcounts(spe)))
 distMat <- distances(cXg_mat)
 silCluster <- as.matrix(silhouette(clusts, distMat))
@@ -163,14 +165,14 @@ spe$rcSil <- silCluster[, 3]
 
 # ARI and NMI
 as.data.frame(colData(spe)) %>%
-  summarise(ARI = aricode::ARI(celltype_mapped_refined, reCluster),
-            NMI = aricode::NMI(celltype_mapped_refined, reCluster),
+  summarise(ARI = aricode::ARI(celltype_mapped_refined, clustSIGNAL),
+            NMI = aricode::NMI(celltype_mapped_refined, clustSIGNAL),
             ASW = mean(rcSil))
 ```
 
 ## Entropy spread and distribution
 
-The entropy values generated through clustSIGNAL process can be useful in analyzing the sample structure. The entropy range can indicate whether the tissue sample contains any homogeneous domain-like structures. For example, here the minimum entropy value is 0, which means some cells are placed in completely homogeneous space when looking at neighbourhood sizes of 30 cells (since NN = 30 used for generating entropy data). Moreover, the mean entropy value is low, which can be interpreted as the tissue having at least some domain-like structures containing 30 cells.
+The entropy values generated through the clustSIGNAL process can be useful in analyzing the sample structure. The entropy range can indicate whether the tissue sample contains any homogeneous domain-like structures. For example, here the minimum entropy value is 0, which means some cells are placed in completely homogeneous space when looking at a neighbourhood size of 30 cells (NN = 30 was used for generating this entropy data). Moreover, the mean entropy value is low, which can be interpreted as the tissue having at least some domain-like structures.
 
 ```{r}
 # Data assessment - Overall entropy
@@ -208,9 +210,10 @@ hst_ent + spt_ent
 
 The spread (A) and spatial distribution (B) of region entropy measures can be very useful in assessing the tissue composition of samples - low entropy regions are more homogeneous with domain-like structure, whereas high entropy regions are heterogeneous with more uniform distribution of cells.
 
+
 # Generating entropy data only
 
-To evaluate tissue structure using entropy values, we can run clustSIGNAL up to the entropy measurement step, without running the complete method. The entropy values will be added to the SpatialExperiment object.
+To evaluate tissue structure using entropy values, we can run clustSIGNAL up to the entropy measurement step, without running the complete method. The entropy values will be added to the SpatialExperiment object and can be used for assessing tissue structure.
 
 ```{r}
 data(mEmbryo2)
@@ -227,6 +230,7 @@ spe <- entropyMeasure(spe, cells = "uniqueID", outReg$regXclust)
 head(spe$entropy)
 ```
 
+
 # Multisample analysis with clustSIGNAL
 
 Here, we use the MERFISH mouse hypothalamic preoptic region dataset from [Moffitt et al, 2018](https://www.science.org/doi/10.1126/science.aau5324), which contains spatial transcriptomics data from 181 samples, with 155 genes and a total of 1,027,080 cells. For this vignette, we subset the data by selecting a total of 6000 random cells from only 3 samples - Animal 1 Bregma -0.09 (2080 cells), Animal 7 Bregma 0.16 (1936 cells), and Animal 7 Bregma -0.09 (1984 cells), excluding cells that were manually annotated as 'ambiguous' and 20 genes that were assessed using a different technology.
@@ -248,7 +252,7 @@ colnames(colData(spe2))
 
 ## clustSIGNAL run
 
-One of the important concepts to take into account when running multisample analysis is batch effects. For samples gathered from different sources or through different technologies/procedures, some technical batch effects might be observed. We run clustSIGNAL in batch correction mode simply by setting batch = TRUE. The method then uses [harmony](https://portals.broadinstitute.org/harmony/) internally for batch correction.
+One of the important concepts to take into account when running multisample analysis is batch effects. When gathering samples from different sources or through different technologies/procedures, some technical batch effects might be introduced into the dataset. We run clustSIGNAL in batch correction mode simply by setting batch = TRUE. The method then uses [harmony](https://portals.broadinstitute.org/harmony/) internally for batch correction.
 
 ```{r}
 set.seed(101)
@@ -276,7 +280,7 @@ samplesList
 silWidthRC <- matrix(nrow = 0, ncol = 3)
 for (s in samplesList) {
   speX <- spe2[, spe2[[samples]] == s]
-  clust_sub <- as.numeric(as.character(speX$reCluster))
+  clust_sub <- as.numeric(as.character(speX$clustSIGNAL))
   cXg <- t(as.matrix(logcounts(speX)))
   distMat <- distances(cXg)
   silCluster <- as.matrix(silhouette(clust_sub, distMat))
@@ -286,8 +290,8 @@ spe2$rcSil <- silWidthRC[, 3]
 
 as.data.frame(colData(spe2)) %>%
   group_by(samples) %>%
-  summarise(ARI = aricode::ARI(Cell_class, reCluster),
-            NMI = aricode::NMI(Cell_class, reCluster),
+  summarise(ARI = aricode::ARI(Cell_class, clustSIGNAL),
+            NMI = aricode::NMI(Cell_class, clustSIGNAL),
             ASW = mean(rcSil),
             min_Entropy = min(entropy),
             max_Entropy = max(entropy),
@@ -296,7 +300,7 @@ as.data.frame(colData(spe2)) %>%
 
 ## Visualizing clustSIGNAL clusters
 
-clustSIGNAL performs clustering on all cells in the dataset in one setting, thereby generating the same clusters across multiple samples. The user does not need to map cluster labels between samples. For example, cluster 1 represents the same cell type in all three samples, without needing explicit mapping between samples.
+clustSIGNAL performs clustering on all cells in the dataset in one run, thereby generating the same clusters across multiple samples. The user does not need to map cluster labels between samples. For example, cluster 1 represents the same cell type in all three samples, without needing explicit mapping between samples.
 
 ```{r}
 df_ent <- as.data.frame(colData(spe2))
@@ -305,7 +309,7 @@ df_ent <- as.data.frame(colData(spe2))
 spt_clust <- df_ent %>%
     ggplot(aes(x = spatialCoords(spe2)[, 1],
                y = -spatialCoords(spe2)[, 2])) +
-    geom_scattermore(pointsize = 3, aes(colour = reCluster)) +
+    geom_scattermore(pointsize = 3, aes(colour = clustSIGNAL)) +
     scale_color_manual(values = colors) +
     facet_wrap(vars(samples), scales = "free", nrow = 1) +
     labs(x = "x-coordinate", y = "y-coordinate") +
@@ -320,19 +324,19 @@ for (s in samplesList) {
   df_ent_sub <- as.data.frame(colData(spe2)[spe2[[samples]] == s, ])
   # calculating median entropy of each cluster in a sample
   celltype_ent <- df_ent_sub %>%
-    group_by(as.character(reCluster)) %>%
+    group_by(as.character(clustSIGNAL)) %>%
     summarise(meanEntropy = median(entropy))
   # reordering clusters by their median entropy
   # low to high median entropy
   cellOrder <- celltype_ent$meanEntropy
-  names(cellOrder) <- celltype_ent$`as.character(reCluster)`
+  names(cellOrder) <- celltype_ent$`as.character(clustSIGNAL)`
   cellOrder = sort(cellOrder)
-  df_ent_sub$reCluster <- factor(df_ent_sub$reCluster, levels = names(cellOrder))
+  df_ent_sub$clustSIGNAL <- factor(df_ent_sub$clustSIGNAL, levels = names(cellOrder))
 
   # box plot of cluster entropy
   colors_ent <- colors[as.numeric(names(cellOrder))]
   box_clust[[s]] <- df_ent_sub %>%
-    ggplot(aes(x = reCluster, y = entropy, fill = reCluster)) +
+    ggplot(aes(x = clustSIGNAL, y = entropy, fill = clustSIGNAL)) +
     geom_boxplot() +
     scale_fill_manual(values = colors_ent) +
     facet_wrap(vars(samples), nrow = 1) +
@@ -385,7 +389,6 @@ hst_ent / spt_ent + plot_layout(heights = c(3,5)) +
     plot_annotation(title = "Entropy spread (top) and spatial distribution (bottom)")
 ```
 
-
 <details>
 
 <summary>**Session Information**</summary>