Replacing PROJECT_SPOTS and MCQUANT by more optimized method #15

Merged · 9 commits · Oct 25, 2023
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,19 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## v1.0.1dev - [2023.23.10]

- Replaced the `PROJECT_SPOTS` and `MCQUANT` modules with `spot2cell`. This new (for now local) module drastically reduces RAM requirements because it does not create a multi-channel stack for the spots; instead, spots are assigned by looking up cell IDs at their x,y positions while iterating over the deduplicated spot table.
- Added process labels to many modules to fix linting warnings.
- Added a meta map to the molcart_qc output to remove a linting warning and adjusted the script for MultiQC input accordingly.
- Added duplicated spot counts to collect_QC.py and multiqc_config.yml so they are also reported.
- Added a tag option to spot2cell so that outputs with the same sample ID but different segmentation methods can be distinguished (previously they overwrote each other).
- Removed PROJECT_SPOTS and MCQUANT from modules.config.
- Changed the publishDir pattern for molcart_qc, which was not matching the files (removed the `{}`).
- Added a meta value to the segmethod input of molcart_qc.
- Spot counts are now integer values.
- QC metrics are now rounded to 2 decimals.
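The lookup-based assignment described in the first entry can be sketched as follows. This is a minimal illustration using NumPy integer-array indexing on a toy mask, not the pipeline's actual spot2cell code:

```python
import numpy as np
import pandas as pd

# Toy labeled cell mask: 0 = background, 1..N = cell IDs
cell_mask = np.array([
    [0, 1, 1],
    [0, 2, 2],
    [0, 2, 2],
])

# Deduplicated spot table with pixel coordinates and gene names
spots = pd.DataFrame({"x": [1, 2, 1, 0], "y": [0, 2, 2, 1], "gene": ["A", "B", "A", "B"]})

# Vectorized lookup: index the mask at (y, x) for every spot at once,
# instead of projecting spots into a per-gene image stack
spots["CellID"] = cell_mask[spots["y"].to_numpy(), spots["x"].to_numpy()]

# Count genes per cell; background (CellID 0) can be split off afterwards
counts = spots.groupby(["CellID", "gene"]).size().unstack(fill_value=0)
print(counts)
```

Indexing the mask directly at the spot coordinates is where the memory saving comes from: the old approach materialized one image channel per gene before quantification.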

## v1.0.1dev - [2023.22.10]

Replaced the `clahe` param with `skip_clahe` so that the default value for running CLAHE is `False`.
3 changes: 3 additions & 0 deletions assets/multiqc_config.yml
@@ -47,6 +47,9 @@ custom_data:
spot_assign_percent:
title: Percentage of spots assigned to cell
description: "% of spots assigned to cells"
duplicated_total:
title: Total number of duplicated spots in the area
description: "Total number of duplicated spots"
sp:
segmentation_stats:
fn: "final_QC.all_samples.csv"
38 changes: 22 additions & 16 deletions bin/collect_QC.py
@@ -14,50 +14,54 @@ def summarize_spots(spot_table):
## Calculate the total number of spots in spot_table
total_spots = spot_table.shape[0]

return (tx_per_gene, total_spots)
## Get list of genes
genes = spot_table["gene"].unique()

return (tx_per_gene, total_spots, genes)

def summarize_segmasks(mcquant, spots_summary):
## Calculate the total number of cells (rows) in mcquant
total_cells = mcquant.shape[0]

## Calculate the average segmentation area from column Area in mcquant
avg_area = mcquant["Area"].mean()
def summarize_segmasks(cellxgene_table, spots_summary):
## Calculate the total number of cells (rows) in cellxgene_table
total_cells = cellxgene_table.shape[0]

## Calculate the average segmentation area from column Area in cellxgene_table
avg_area = cellxgene_table["Area"].mean()

## Calculate the % of spots assigned
## Subset mcquant for all columns with _intensity_sum in the column name and sum the column values
spot_assign = mcquant.filter(regex="_intensity_sum").sum(axis=1)
## Subset cellxgene_table for all columns with _intensity_sum in the column name and sum the column values
spot_assign = cellxgene_table[spots_summary[2]].sum(axis=1)
spot_assign_total = int(sum(spot_assign))
spot_assign_per_cell = total_cells and spot_assign_total / total_cells or 0
spot_assign_per_cell = round(spot_assign_per_cell, 2)
# spot_assign_per_cell = spot_assign_total / total_cells
spot_assign_percent = spot_assign_total / spots_summary[1] * 100
spot_assign_percent = round(spot_assign_percent, 2)

return (total_cells, avg_area, spot_assign_per_cell, spot_assign_total, spot_assign_percent)


if __name__ == "__main__":
# Write an argparse with input options mcquant_in, spots and output options outdir, sample_id
# Write an argparse with input options cellxgene_table, spots and output options outdir, sample_id
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--mcquant", help="mcquant regionprops_table.")
parser.add_argument("-i", "--cellxgene", help="cellxgene regionprops_table.")
parser.add_argument("-s", "--spots", help="Resolve biosciences spot table.")
parser.add_argument("-o", "--outdir", help="Output directory.")

parser.add_argument("-d", "--sample_id", help="Sample ID.")

parser.add_argument("-g", "--segmentation_method", help="Segmentation method used.")

args = parser.parse_args()

## Read in mcquant table
mcquant = pd.read_csv(args.mcquant)
## Read in cellxgene_table table
cellxgene_table = pd.read_csv(args.cellxgene, sep=",")

## Read in spot table
spots = pd.read_table(args.spots, sep="\t", names=["x", "y", "z", "gene"])
duplicated = sum(spots.gene.str.contains("Duplicated"))
spots = spots[~spots.gene.str.contains("Duplicated")]

## Summarize spots table
summary_spots = summarize_spots(spots)
summary_segmentation = summarize_segmasks(mcquant, summary_spots)
summary_segmentation = summarize_segmasks(cellxgene_table, summary_spots)

## Create pandas data frame with one row per parameter and write each value in summary_segmentation to a new row in the data frame
summary_df = pd.DataFrame(
@@ -70,6 +74,7 @@ def summarize_segmasks(mcquant, spots_summary):
"spot_assign_per_cell",
"spot_assign_total",
"spot_assign_percent",
"duplicated_total",
]
)
summary_df.loc[0] = [
@@ -82,8 +87,9 @@ def summarize_segmasks(mcquant, spots_summary):
summary_segmentation[2],
summary_segmentation[3],
summary_segmentation[4],
duplicated,
]

print(args.sample_id)
# Write summary_df to a csv file
summary_df.to_csv(
f"{args.outdir}/{args.sample_id}.{args.segmentation_method}.spot_QC.csv", header=True, index=False
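The per-cell average in collect_QC.py uses the `total_cells and x / total_cells or 0` idiom to guard against an empty mask. A small sketch of the equivalent explicit logic (illustrative values, not pipeline output):

```python
def spots_per_cell(spot_assign_total: int, total_cells: int) -> float:
    # Guard against division by zero when the mask contains no cells,
    # then round to 2 decimals as the QC report expects
    return round(spot_assign_total / total_cells, 2) if total_cells else 0.0

print(spots_per_cell(250, 100))  # 2.5
print(spots_per_cell(250, 0))    # 0.0
```

The `and`/`or` chain in the script behaves the same way here because a nonzero float average is always truthy; the conditional expression makes the intent explicit.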
64 changes: 0 additions & 64 deletions bin/project_spots.dask.py

This file was deleted.

129 changes: 129 additions & 0 deletions bin/spot2cell.py
@@ -0,0 +1,129 @@
#!/usr/bin/env python

## Import packages
import pandas as pd
import numpy as np
from skimage.measure import regionprops_table
import tifffile as tiff
import argparse
import os


def assign_spots2cell(spot_table, cell_mask):
# Initialize a dictionary to hold the counts
gene_counts = {}

# Calculate cell properties for cell_mask using regionprops_table
cell_props = regionprops_table(
cell_mask,
properties=[
"label",
"centroid",
"area",
"major_axis_length",
"minor_axis_length",
"eccentricity",
"solidity",
"extent",
"orientation",
],
)

# Turn cell props into a pandas DataFrame and add a Cell_ID column
name_map = {
"CellID": "label",
"X_centroid": "centroid-1",
"Y_centroid": "centroid-0",
"Area": "area",
"MajorAxisLength": "major_axis_length",
"MinorAxisLength": "minor_axis_length",
"Eccentricity": "eccentricity",
"Solidity": "solidity",
"Extent": "extent",
"Orientation": "orientation",
}

for new_name, old_name in name_map.items():
cell_props[new_name] = cell_props[old_name]

for old_name in set(name_map.values()):
del cell_props[old_name]

cell_props = pd.DataFrame(cell_props)

# Exclude any rows that contain Duplicated in the gene column from spot_table
spot_table = spot_table[~spot_table["gene"].str.contains("Duplicated")]

# Iterate over each row in the grouped DataFrame
for index, row in spot_table.iterrows():
# Get the x and y positions and gene
x = int(row["x"])
y = int(row["y"])
gene = row["gene"]

# Get the cell ID from the labeled mask
cell_id = cell_mask[y, x]

# If the cell ID is not in the dictionary, add it
if cell_id not in gene_counts:
gene_counts[cell_id] = {}
if gene not in gene_counts[cell_id]:
gene_counts[cell_id][gene] = 1
else:
gene_counts[cell_id][gene] += 1
else:
if gene not in gene_counts[cell_id]:
gene_counts[cell_id][gene] = 1
else:
# Add the count for this gene in this cell ID
gene_counts[cell_id][gene] += 1

# Convert the dictionary of counts into a DataFrame
gene_counts_df = pd.DataFrame(gene_counts).T

# Add a CellID column to gene_counts_df from the index (moved to the first column below)
gene_counts_df["CellID"] = gene_counts_df.index

# Merge the regionprops data from cell_props onto gene_counts_df; cells with no assigned spots get NaN counts (outer join)
gene_counts_df = gene_counts_df.merge(cell_props, on="CellID", how="outer")

# Convert NaN values to 0
gene_counts_df = gene_counts_df.fillna(0)

# Sort by Cell_ID in ascending order
gene_counts_df = gene_counts_df.sort_values(by=["CellID"])

# Make Cell_ID the first column in gene_counts_df
gene_counts_df = gene_counts_df.set_index("CellID").reset_index()

gene_counts_df[spot_table.gene.unique()] = gene_counts_df[spot_table.gene.unique()].astype(int)

# Filter out CellID = 0 into its own dataframe called background
background = gene_counts_df[gene_counts_df["CellID"] == 0]
gene_counts_df = gene_counts_df[gene_counts_df["CellID"] != 0]

# Return both gene_counts_df and background
return gene_counts_df, background


if __name__ == "__main__":
# Argument parser for the spot table, cell mask, and an optional filename tag
parser = argparse.ArgumentParser()
parser.add_argument("-s", "--spot_table", help="Spot table to assign to cells.")
parser.add_argument("-c", "--cell_mask", help="Labeled cell mask image.")
parser.add_argument("--tag", type=str, help="Additional tag to append to the output filename.")

args = parser.parse_args()

## Read in spot table
spot_data = pd.read_csv(
args.spot_table, names=["x", "y", "z", "gene", "empty"], sep="\t", header=None, index_col=None
)

cell_mask = tiff.imread(args.cell_mask)

gene_counts_df, background = assign_spots2cell(spot_data, cell_mask)

basename = os.path.basename(args.spot_table)
basename = os.path.splitext(basename)[0]
gene_counts_df.to_csv(f"{basename}.{args.tag}.cellxgene.csv", sep=",", header=True, index=False)
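The nested if/else counting branches in `assign_spots2cell` can be collapsed with `collections.defaultdict`; a sketch of the equivalent per-spot counting logic (toy data, not the module's code):

```python
from collections import defaultdict

# gene_counts[cell_id][gene] starts at 0 and is incremented per spot,
# replacing the explicit "is the key present yet?" branches
gene_counts = defaultdict(lambda: defaultdict(int))

spot_stream = [(1, "A"), (1, "A"), (2, "B"), (0, "A")]  # (cell_id, gene) pairs
for cell_id, gene in spot_stream:
    gene_counts[cell_id][gene] += 1

print(gene_counts[1]["A"])  # 2
```

The resulting nested dict can be passed to `pd.DataFrame(...).T` exactly as the dictionary built by the original branches.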
15 changes: 1 addition & 14 deletions conf/modules.config
@@ -38,7 +38,7 @@ process {
withName: 'MOLCART_QC' {
publishDir = [
path: { "${params.outdir}/molcart_qc" },
pattern: { "*.csv" }
pattern: "*.csv"
]
}

@@ -91,14 +91,6 @@
].join(" ").trim()
}

withName: "PROJECT_SPOTS" {
memory = "16GB"
publishDir = [
path: "${params.outdir}/projectedspots",
pattern: "*.{tiff,csv}"
]
}

withName: "CLAHE_DASK" {
memory = "16GB"
ext.when = { !params.skip_clahe }
@@ -150,9 +142,4 @@
saveAs: { filename -> "${meta.id}_cellpose_mask.tif" }
]
}

withName: "MCQUANT" {
ext.args = "--intensity_props intensity_sum"
}

}
5 changes: 0 additions & 5 deletions modules.json
@@ -30,11 +30,6 @@
"git_sha": "6f150e1503c0826c21fedf1fa566cdbecbe98ec7",
"installed_by": ["modules"]
},
"mcquant": {
"branch": "master",
"git_sha": "b9829e1064382745d8dff7f1d74d2138d2864f71",
"installed_by": ["modules"]
},
"mindagap/mindagap": {
"branch": "master",
"git_sha": "240937a2a9c30298110753292be041188891f2cb",
1 change: 1 addition & 0 deletions modules/local/clahe_dask.nf
@@ -1,6 +1,7 @@
process CLAHE_DASK{
debug false
tag "Applying CLAHE to $meta.id"
label 'process_low'

container 'ghcr.io/schapirolabor/background_subtraction:v0.3.3'

1 change: 1 addition & 0 deletions modules/local/create_stack.nf
@@ -1,5 +1,6 @@
process CREATE_STACK {
tag "Stacking channels for $meta.id"
label 'process_medium'

container 'ghcr.io/schapirolabor/background_subtraction:v0.3.3'
