FredHutch · cansavvy · Dec 5, 2024 · Dec 5, 2024 · Dec 5, 2024
diff --git a/R/00-setup_data.R b/R/00-setup_data.R
@@ -24,7 +24,6 @@ setup_data <- function(counts = NULL,
     raw_counts = NULL,
     counts_per_sample = NULL,
     transformed_data = list(
-      count_norm = NULL,
       cpm = NULL,
       log2_cpm = NULL
     ),
@@ -42,8 +41,10 @@ setup_data <- function(counts = NULL,
     comparisons = NULL,
     annotation = NULL,
     normalized_log_fc = NULL,
-    crispr_score = NULL,
-    results = NULL
+    single_crispr_score = NULL,
+    double_crispr_score = NULL,
+    gi_scores = NULL,
+    overall_results = NULL
   )
 
   class(new_data) <- c("list", "gimap_dataset")
@@ -77,7 +78,6 @@ setup_data <- function(counts = NULL,
   new_data$counts_per_sample <- apply(counts, 2, sum)
 
   # Transform the data
-  new_data$transformed_data$count_norm <- apply(counts, 2, function(x) -log10((x + 1) / sum(x)))
   new_data$transformed_data$cpm <- apply(counts, 2, function(x) (x / sum(x)) * 1e6)
   new_data$transformed_data$log2_cpm <- log2(new_data$transformed_data$cpm + 1)
 

diff --git a/R/04-normalize.R b/R/04-normalize.R
@@ -1,5 +1,23 @@
 #' Normalize Log fold changes
-#' @description This calculates the log fold change for a gimap dataset based on the annotation and metadata provided.
+#' @description This calculates the log fold change for a gimap dataset based on
+#' the annotation and metadata provided.
+#' gimap takes in a counts matrix that represents the number of cells that have
+#' each type of pgRNA this data needs some normalization before CRISPR scores and
+#' Genetic Interaction scores can be calculated.
+#'
+#' There are four steps of normalization.
+#' 1. `Calculate log2CPM` - First we account for different read depths across samples
+#' and transforms data to log2 counts per million reads.
+#' `log2((counts / total counts for sample)) * 1 million) + 1)`
+#' 2. `Calculate log2 fold change` - This is done by subtracting the log2CPM for
+#' the pre-treatment from each sample.  control is what is highlighted.
+#' The pretreatment is the day 0 of CRISPR treatment, before CRISPR pgRNAs have taken effect.
+#' `log2FC = log2CPM for each sample - pretreament log2CPM`
+#'
+#' 3. `Normalize by negative and positive controls` - Calculate a negative control
+#' median for each sample and a positive control median for each sample and divide each log2FC by this value.
+#' `log2FC adjusted = log2FC / (median negative control for a sample - median positive control for a sample)`
+#'
 #' @param .data Data can be piped in with a tidyverse pipe from function to function. But the data must still be a gimap_dataset
 #' @param gimap_dataset A special dataset structure that is setup using the `setup_data()` function.
 #' @param timepoints Specifies the column name of the metadata set up in `$metadata$sample_metadata`
@@ -187,14 +205,14 @@ gimap_normalize <- function(.data = NULL,
     mutate(across(names(neg_control_median), ~ . - neg_control_median[cur_column()])) %>%
     tidyr::pivot_longer(dplyr::ends_with(treatment_group_names),
       names_to = "rep",
-      values_to = "lfc_adj1"
+      values_to = "lfc"
     ) %>%
     group_by(rep) %>%
     dplyr::mutate(
       # Then, divide by the median of negative controls (double non-targeting) minus
       # median of positive controls (targeting 1 essential gene).
       # This will effectively set the median of the positive controls (essential genes) to -1.
-      lfc_adj = lfc_adj1 / (median(lfc_adj1[norm_ctrl_flag == "negative_control"], na.rm = TRUE) - median(lfc_adj1[norm_ctrl_flag == "positive_control"], na.rm = TRUE))
+      lfc_adj = lfc / (median(lfc[norm_ctrl_flag == "negative_control"], na.rm = TRUE) - median(lfc[norm_ctrl_flag == "positive_control"], na.rm = TRUE))
     ) %>%
     ungroup()
 

diff --git a/R/05-crispr-calc.R b/R/05-crispr-calc.R
@@ -1,5 +1,10 @@
 #' Calculate CRISPR scores
-#' @description This calculates the log fold change for a gimap dataset based on the annotation and metadata provided.
+#' @description This calculates CRISPR scores for a gimap dataset based on the annotation and metadata provided.
+#' Since the pgPEN library uses non-targeting controls, we adjust for the fact that single-targeting pgRNAs generate only two double-strand breaks (1 per allele), whereas the double-targeting pgRNAs generate four DSBs. To do this, we set the median LFC of each group to zero.
+#'
+#'Calculate medians of based on single and double targeting and subtract these medians from `log2FC adjusted`
+#' `crispr score = log2FC adjusted - median for each target type`
+#'
 #' @param .data Data can be piped in with tidyverse pipes from function to function. But the data must still be a gimap_dataset
 #' @param gimap_dataset A special dataset structure that is setup using the `setup_data()` function.
 #' @export
@@ -45,13 +50,13 @@ calc_crispr <- function(.data = NULL,
 
   # Calculate medians based on single, double targeting as well as if they are unexpressed control genes
   medians_df <- source_data %>%
-    dplyr::group_by(target_type, unexpressed_ctrl_flag) %>%
+    dplyr::group_by(target_type) %>%
     dplyr::summarize(median = median(lfc_adj, na.rm = TRUE))
 
   message("Calculating CRISPR score")
 
   lfc_df <- source_data %>%
-    dplyr::left_join(medians_df, by = c("target_type", "unexpressed_ctrl_flag")) %>%
+    dplyr::left_join(medians_df, by = c("target_type")) %>%
     dplyr::mutate(
       # Since the pgPEN library uses non-targeting controls, we adjusted for the
       # fact that single-targeting pgRNAs generate only two double-strand breaks

diff --git a/R/06-calculate_gi.R b/R/06-calculate_gi.R
@@ -1,5 +1,28 @@
 #' Calculate Genetic Interaction scores
 #' @description Create results table that has CRISPR scores, Wilcoxon rank-sum test and t tests.
+#' The output of the `gimap` package is genetic interaction scores which _is the
+#' distance between the observed CRISPR score and the expected CRISPR score._
+#' The expected CRISPR scores are what we expect for the CRISPR values should two
+#' genes be unrelated to each other. The further away an observed CRISPR score is
+#' from its expected the more we suspect genetic interaction.
+#' This can be true in a positive way (a CRISPR knockout pair caused more cell
+#' proliferation than expected) or in a negative way (a CRISPR knockout pair caused
+#' more cell lethality than expected).
+#'
+#' The genetic interaction scores are based on a linear model calculated for each sample where `observed_crispr_single` is the outcome variable and `expected_crispr_single` is the predictor variable.
+#' For each sample: lm(observed_crispr_single ~ expected_crispr_single)
+#'
+#' Using `y = mx+b`, we can fill in the following values:
+#' * `y` = observed CRISPR score
+#' * `x` = expected CRISPR score
+#' * `m` = slope from linear model for this sample
+#' * `b` = intercept from linear model for this sample
+#'
+#' The intercept and slope from this linear model are used to adjust the CRISPR scores for each sample:
+#' single target gi score = observed single crispr - (intercept + slope * expected single crispr)
+#' double_target_gi_score = double crispr score - (intercept + slope * expected double crispr)
+#' These single and double target genetic interaction scores are calculated at
+#' the construct level and are then summarized using a t-test to see if the the distribution of the set of double targeting constructs is significantly different than the overall distribution single targeting constructs. After multiple testing correction, FDR values are reported. Low FDR value for a double construct means high suspicion of paralogs.
 #' @param .data Data can be piped in with tidyverse pipes from function to function. But the data must still be a gimap_dataset
 #' @param gimap_dataset A special dataset structure that is setup using the `setup_data()` function.
 #' @export
@@ -23,11 +46,6 @@
 calc_gi <- function(.data = NULL,
                     gimap_dataset) {
 
-  #Controls
-  #Replicates
-  #Means
-
-
   # Summary the calculation
   # single_target_crispr_1 = geneA_nt1, geneA_nt2...
   # single_target_crispr_2 = nt1_geneB, nt2_geneB...
@@ -39,7 +57,7 @@ calc_gi <- function(.data = NULL,
   # expected_crispr_single_1 = single_target_crispr_1 + mean_double_control_crispr
   # expected_crispr_single_2 = single_target_crispr_2 + mean_double_control_crispr
 
-  # linear model is lm(mean_observed_single_crispr ~ mean_expected_single_crispr)
+  # linear model is lm(observed_single_crispr ~ expected_single_crispr)
 
   # single_target_gi_score = observed_single_crispr - (intercept + slope * expected_single_crispr)
   # double_target_gi_score = double_crispr_score - (intercept + slope * expected_double_crispr)
@@ -140,10 +158,7 @@ calc_gi <- function(.data = NULL,
   gimap_dataset$gi_scores <- all_gi_scores
 
   # Store this
-  gimap_dataset$results <-
-    list(
-      single_lm = single_lm_df
-    )
+  gimap_dataset$overall_results <- single_lm_df
 
   return(gimap_dataset)
 }

diff --git a/README.md b/README.md
@@ -1,6 +1,7 @@
 # gimap - Genetic Interaction MAPping for dual target CRISPR screens
 
 ## Background on paired guide CRISPR
+
 Some genes have "backup copies" - these are called paralog genes. Think of it like having a spare tire in your car - if one fails, the other can often do a similar job. This redundancy makes it tricky to study what these genes do, because if you knock out just one gene, its partner might pick up the slack.
 
 CRISPR allows genes to be knocked out so we can see what their function is. But because of paralogs it can be hard to parse out what genes are actually involved. So instead of just targeting one gene at a time, paired guide CRISPR screening, allows us to knockout two genes at a time.
@@ -10,18 +11,111 @@ It's particularly useful for understanding:
 - Which gene combinations are essential for cell survival
 
 ## What is gimap?
+
 gimap - is software tool that helps make sense of paired CRISPR screening data. Here's what it does:
-1. Takes data from paired CRISPR screens that has been pre-processed by the pgmap software. 
+1. Takes data from paired CRISPR screens that has been pre-processed by the pgmap software, or any counts table of paired gRNA reads
 2. The input data will have cell counts for how well cells grow (or don't grow) when different genes or pairs of genes are disabled
 3. gimap can take this data and helps identify interesting patterns, like:
    - When disabling two genes together is more devastating than you'd expect from disabling them individually (called synthetic lethality)
    - When genes work together cooperatively
-   - Which gene combinations might be important in diseases like various canceres
+   - Which gene combinations might be important in diseases like various cancers
 
-gimap can help find meaningful patterns in complex genetic experiments. It's particularly focused on analyzing data from the Berger Lab paired CRISPR screening library called pgPEN (paired guide RNAs for genetic interaction mapping).
+gimap can help find meaningful patterns in complex genetic experiments. It's particularly focused on analyzing data generated by screening cells with the Berger Lab paired gRNA CRISPR screening library called pgPEN (paired guide RNAs for genetic interaction mapping).
 
 The gimap package is based off of the original code and research from the Berger Lab stored in this repository: https://github.com/FredHutch/GI_mapping
 
+## About Genetic Interaction Scores
+
+The output of the `gimap` package is genetic interaction scores which _is the distance between the observed CRISPR score and the expected CRISPR score._ The expected CRISPR scores are what we expect for the CRISPR values should two genes be unrelated to each other. The further away an observed CRISPR score is from its expected the more we suspect genetic interaction.
+
+This can be true in a positive way (a CRISPR knockout pair caused more cell proliferation than expected -- called cooperativity) or in a negative way (a CRISPR knockout pair caused more cell lethality than expected).
+
+The genetic interaction scores are based on a linear model calculated for each sample where `observed_crispr_single` is the outcome variable and `expected_crispr_single` is the predictor variable.
+
+**For each sample:**
+```
+lm(observed_crispr_single ~ expected_crispr_single)
+```
+Using `y = mx+b`, we can fill in the following values:
+* `y` = observed CRISPR score
+* `x` = expected CRISPR score
+* `m` = slope from linear model for this sample
+* `b` = intercept from linear model for this sample
+
+The intercept and slope from this linear model are used to adjust the CRISPR scores for each sample:
+```
+single target gi score = observed single crispr - (intercept + slope * expected single crispr)
+double_target_gi_score = double crispr score - (intercept + slope * expected double crispr)
+```
+
+These single and double target genetic interaction scores are calculated at the construct level and are then summarized using a t-test to see if the the distribution of the set of double targeting constructs is significantly different than the overall distribution single targeting constructs. After multiple testing correction, FDR values are reported. Low FDR value for a double construct means high suspicion of paralogs.
+
+## pgPEN library design
+
+The `gimap` package is based off of an original paired CRISPR knockout library design from the Berger lab.
+
+There are four target types included in the custom Berger lab pgPEN library
+
+- `double_targeting` - these cells have two different genes that have been knocked out with pgRNAs. This is sometimes noted as `"gene_gene"`
+- `single_targeting` - these cells have one gene that have been knocked out with pgRNA and another that is designed to not target any gene. This is includes `"gene_ctrl"` and `"ctrl_gene"`.
+- `positive_control` - these cells have one **essential control** gene that has been knocked out with pgRNA and another that is designed to NOT target any genes. These are also noted as This is includes `"gene_ctrl"` and `"ctrl_gene"` but for when the gene is an essential gene, e.g. required in most cell lines for survival. 
+- `negative_control` - these cells have two pgRNAs designed to NOT target any genes. This is noted as `"ctrl_ctrl"`.
+
+In the instance of a single gene pair: e.g. `geneA_geneB`, there are 32 different constructs related to it. `32 = 16 single targeting and 16 double targeting`
+
+- There are 16 `double targeting`: `geneA_geneBpg1, geneA_geneBpg2, ... geneA_geneBpg16`.
+- 4 unique targeting sequences for `gene A` * 4 unique targeting sequences for `gene B` = 16 unique combos of double targeting constructs.
+- There are 16 `single targeting` in relation to this `geneA_geneB` construct there are: `geneA_ctrl` and `ctrl_geneB` sequences.
+- (4 unique targeting sequences (`"geneA"`) * 2 non targeting sequences (`"ctrl"`) = 8 constructs)
+- (2 non targeting sequences (`"ctrl"`) * 4 unique targeting sequences (`"geneB"`) = 8 constructs)
+
+#### Expected CRISPR scores
+
+Expected CRISPR scores are calculated based on the pgRNAs included.
+
+For `double_targeting` construct we would expect the CRISPR score to be the sum of the single targeting CRISPR scores which use the same pgRNA.
+```
+expected crispr double = single target crispr 1 + single target crispr 2
+```
+
+So in the instance of the double construct `GSN_SCIN_pg1` we would expect its CRISPR score to be the same as `GSN_ctrl` + `ctrl_SCIN` where the pgRNA used to target GSN and respectively are the same.
+
+For `single_targeting` constructs we would expect the CRISPR score to be the sum of a single target plus the mean of the control pgRNA from the `negative control constructs` (or double non-targeting CRISPR)
+
+```
+expected crispr single = single target crispr 1 + mean negative control for the pgRNA
+```
+So for `TRPC5_nt1` single target we would take its CRISPR score + the mean of the `nt1` sequence across the constructs where it is included in a double non targeting construct.
+
+### Normalization
+
+gimap takes in a counts matrix that represents the number of cells that have each type of pgRNA this data needs some normalization before CRISPR scores and Genetic Interaction scores can be calculated.
+
+There are four steps of normalization.
+1. `Calculate log2CPM` - First we account for different read depths across samples and transforms data to log2 counts per million reads.
+```
+log2((counts / total counts for sample)) * 1 million) + 1)
+```
+
+2. `Calculate log2 fold change` - This is done by subtracting the log2CPM for the pre-treatment from each sample.  control is what is highlighted. The pretreatment is the day 0 of CRISPR treatment, before CRISPR pgRNAs have taken effect.
+```
+log2FC = log2CPM for each sample - pretreament log2CPM
+```
+3. `Normalize by negative and positive controls` - Calculate a negative control median for each sample and a positive control median for each sample and divide each log2FC by this value.
+```
+log2FC adjusted = log2FC / (median negative control for a sample - median positive control for a sample)
+```
+
+### CRISPR scores
+
+Since the pgPEN library uses non-targeting controls, we adjust for the fact that single-targeting pgRNAs generate only two double-strand breaks (1 per allele), whereas the double-targeting pgRNAs generate four DSBs. To do this, we set the median LFC of each group to zero.
+
+Calculate medians of based on single and double targeting and subtract these medians from `log2FC adjusted`
+
+```
+crispr score = log2FC adjusted - median for each target type
+```
+
 ## Prerequisites
 
 In order to run this pipeline you will need R and to install the `gimap` package and its dependencies. In R you can run this to install the package:
@@ -41,7 +135,7 @@ We also have tutorial examples that show how to run timepoint or treatment exper
 
 Follow the steps there that will walk you through the example data. Then you can tailor that tutorial to use your own data.
 
-## Citations: 
+## Citations:
 - https://pubmed.ncbi.nlm.nih.gov/34469736/
 - https://www.nature.com/articles/s43586-021-00093-4
 

diff --git a/inst/rmd/gimap_dataset_final.RDS b/inst/rmd/gimap_dataset_final.RDS
diff --git a/man/calc_crispr.Rd b/man/calc_crispr.Rd