Skip to content

Commit

Permalink
Merge pull request #15 from nf-core/spot2cell
Browse files Browse the repository at this point in the history
Replacing PROJECT_SPOTS and MCQUANT with a more optimized method
  • Loading branch information
FloWuenne authored Oct 25, 2023
2 parents 8313bf2 + 91d31bd commit 1822788
Show file tree
Hide file tree
Showing 15 changed files with 221 additions and 290 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,19 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## v1.0.1dev - [2023.23.10]

- Replace `PROJECT_SPOTS` and `MCQUANT` modules with spot2cells. This new (for now local) module reduces the RAM requirements drastically, because it doesn't create a multi-channel stack for the spots. Spots are assigned by looking up cell IDs at x,y positions and iterating over the deduplicated spots table.
- Added process labels to many modules to fix linting warnings
- Added meta map to molcart_qc output to remove linting warning -- adjusted script for multiqc input accordingly
- Added duplicated spots counts to collect_qc.py and multiqc_config.yml so that they also get counted.
- Added tag option to spot2cell so that the output names with same sample id and different segmentation methods can be differentiated (they were overwriting each other previously)
- removed project spots and mcquant from modules.config
- changed pattern for molcart_qc as it was not matching the files (removed {})
- added meta value to segmethod input in molcart_qc
- spot counts are now int values
- QC metrics rounded to 2 decimals

## v1.0.1dev - [2023.22.10]

Replaced the `clahe` param with `skip_clahe` so that the default value for running CLAHE is `False`.
Expand Down
3 changes: 3 additions & 0 deletions assets/multiqc_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ custom_data:
spot_assign_percent:
title: Percentage of spots assigned to cell
description: "% of spots assigned to cells"
duplicated_total:
title: Total number of duplicated spots in the area
description: "Total number of duplicated spots"
sp:
segmentation_stats:
fn: "final_QC.all_samples.csv"
Expand Down
38 changes: 22 additions & 16 deletions bin/collect_QC.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,50 +14,54 @@ def summarize_spots(spot_table):
## Calculate the total number of spots in spot_table
total_spots = spot_table.shape[0]

return (tx_per_gene, total_spots)
## Get list of genes
genes = spot_table["gene"].unique()

return (tx_per_gene, total_spots, genes)

def summarize_segmasks(mcquant, spots_summary):
## Calculate the total number of cells (rows) in mcquant
total_cells = mcquant.shape[0]

## Calculate the average segmentation area from column Area in mcquant
avg_area = mcquant["Area"].mean()
def summarize_segmasks(cellxgene_table, spots_summary):
    """Summarize segmentation results against the spot-level summary.

    Parameters
    ----------
    cellxgene_table : pandas.DataFrame
        Per-cell table with an "Area" column and one spot-count column per gene.
    spots_summary : tuple
        Output of summarize_spots: (tx_per_gene, total_spots, genes), where
        genes (index 2) lists the gene columns to sum in cellxgene_table.

    Returns
    -------
    tuple
        (total_cells, avg_area, spot_assign_per_cell, spot_assign_total,
        spot_assign_percent) with the ratio metrics rounded to 2 decimals.
    """
    ## Calculate the total number of cells (rows) in cellxgene_table
    total_cells = cellxgene_table.shape[0]

    ## Calculate the average segmentation area from column Area in cellxgene_table
    avg_area = cellxgene_table["Area"].mean()

    ## Sum the per-gene spot counts for each cell to get spots assigned per cell
    spot_assign = cellxgene_table[spots_summary[2]].sum(axis=1)
    spot_assign_total = int(spot_assign.sum())

    ## Guard against an empty cell table to avoid ZeroDivisionError
    spot_assign_per_cell = round(spot_assign_total / total_cells, 2) if total_cells else 0

    ## Guard against an empty spot table (zero total spots) as well
    total_spots = spots_summary[1]
    spot_assign_percent = round(spot_assign_total / total_spots * 100, 2) if total_spots else 0

    return (total_cells, avg_area, spot_assign_per_cell, spot_assign_total, spot_assign_percent)


if __name__ == "__main__":
# Write an argparse with input options mcquant_in, spots and output options outdir, sample_id
# Write an argparse with input options cellxgene_table, spots and output options outdir, sample_id
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--mcquant", help="mcquant regionprops_table.")
parser.add_argument("-i", "--cellxgene", help="cellxgene regionprops_table.")
parser.add_argument("-s", "--spots", help="Resolve biosciences spot table.")
parser.add_argument("-o", "--outdir", help="Output directory.")

parser.add_argument("-d", "--sample_id", help="Sample ID.")

parser.add_argument("-g", "--segmentation_method", help="Segmentation method used.")

args = parser.parse_args()

## Read in mcquant table
mcquant = pd.read_csv(args.mcquant)
## Read in cellxgene_table table
cellxgene_table = pd.read_csv(args.cellxgene, sep=",")

## Read in spot table
spots = pd.read_table(args.spots, sep="\t", names=["x", "y", "z", "gene"])
duplicated = sum(spots.gene.str.contains("Duplicated"))
spots = spots[~spots.gene.str.contains("Duplicated")]

## Summarize spots table
summary_spots = summarize_spots(spots)
summary_segmentation = summarize_segmasks(mcquant, summary_spots)
summary_segmentation = summarize_segmasks(cellxgene_table, summary_spots)

## Create pandas data frame with one row per parameter and write each value in summary_segmentation to a new row in the data frame
summary_df = pd.DataFrame(
Expand All @@ -70,6 +74,7 @@ def summarize_segmasks(mcquant, spots_summary):
"spot_assign_per_cell",
"spot_assign_total",
"spot_assign_percent",
"duplicated_total",
]
)
summary_df.loc[0] = [
Expand All @@ -82,8 +87,9 @@ def summarize_segmasks(mcquant, spots_summary):
summary_segmentation[2],
summary_segmentation[3],
summary_segmentation[4],
duplicated,
]

print(args.sample_id)
# Write summary_df to a csv file
summary_df.to_csv(
f"{args.outdir}/{args.sample_id}.{args.segmentation_method}.spot_QC.csv", header=True, index=False
Expand Down
64 changes: 0 additions & 64 deletions bin/project_spots.dask.py

This file was deleted.

129 changes: 129 additions & 0 deletions bin/spot2cell.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#!/usr/bin/env python

## Import packages
import pandas as pd
import numpy as np
from skimage.measure import regionprops_table
import tifffile as tiff
import argparse
import os


def assign_spots2cell(spot_table, cell_mask):
    """Assign molecular spots to segmented cells by mask lookup.

    For every (non-duplicated) spot, the cell ID is read directly from the
    labeled mask at the spot's integer (y, x) position, and a per-cell gene
    counter is incremented. Cell morphology from regionprops is merged in so
    the result matches the mcquant-style cell-by-gene table.

    Parameters
    ----------
    spot_table : pandas.DataFrame
        Spot table with columns "x", "y" and "gene". Rows whose gene contains
        "Duplicated" are excluded before counting.
    cell_mask : numpy.ndarray
        2D labeled segmentation mask; label 0 is background.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The cell-by-gene table (CellID first column, gene counts as int,
        regionprops columns appended) and a one-row-per-gene "background"
        table holding the counts that fell on label 0.
    """
    # Per-cell gene counters: {cell_id: {gene: count}}
    gene_counts = {}

    # Measure morphology for every labeled cell in the mask
    cell_props = regionprops_table(
        cell_mask,
        properties=[
            "label",
            "centroid",
            "area",
            "major_axis_length",
            "minor_axis_length",
            "eccentricity",
            "solidity",
            "extent",
            "orientation",
        ],
    )

    # Rename regionprops keys to mcquant-style column names
    # (note centroid-1 is X and centroid-0 is Y: skimage returns (row, col))
    name_map = {
        "CellID": "label",
        "X_centroid": "centroid-1",
        "Y_centroid": "centroid-0",
        "Area": "area",
        "MajorAxisLength": "major_axis_length",
        "MinorAxisLength": "minor_axis_length",
        "Eccentricity": "eccentricity",
        "Solidity": "solidity",
        "Extent": "extent",
        "Orientation": "orientation",
    }

    for new_name, old_name in name_map.items():
        cell_props[new_name] = cell_props[old_name]

    for old_name in set(name_map.values()):
        del cell_props[old_name]

    cell_props = pd.DataFrame(cell_props)

    # Exclude any rows that contain Duplicated in the gene column from spot_table
    spot_table = spot_table[~spot_table["gene"].str.contains("Duplicated")]

    # Count spots per (cell, gene) by looking up the mask label at each spot.
    # NOTE(review): assumes every spot lies within the mask bounds — an
    # out-of-range x/y would raise IndexError; confirm upstream guarantees this.
    for _, row in spot_table.iterrows():
        x = int(row["x"])
        y = int(row["y"])
        gene = row["gene"]

        # Get the cell ID from the labeled mask (0 = background)
        cell_id = cell_mask[y, x]

        cell_genes = gene_counts.setdefault(cell_id, {})
        cell_genes[gene] = cell_genes.get(gene, 0) + 1

    # Convert the dictionary of counts into a DataFrame (rows = cell IDs)
    gene_counts_df = pd.DataFrame(gene_counts).T

    # Expose the cell ID as a regular column for the merge below
    gene_counts_df["CellID"] = gene_counts_df.index

    # Outer merge keeps cells that received no spots (their counts become NaN)
    gene_counts_df = gene_counts_df.merge(cell_props, on="CellID", how="outer")

    # Convert NaN values (cells without spots for a gene) to 0
    gene_counts_df = gene_counts_df.fillna(0)

    # Sort by CellID in ascending order
    gene_counts_df = gene_counts_df.sort_values(by=["CellID"])

    # Make CellID the first column in gene_counts_df
    gene_counts_df = gene_counts_df.set_index("CellID").reset_index()

    # Robustness: a gene with zero assigned spots has no column yet — add it as
    # zeros so the astype(int) below cannot fail with a KeyError
    genes = spot_table["gene"].unique()
    for gene in genes:
        if gene not in gene_counts_df.columns:
            gene_counts_df[gene] = 0
    gene_counts_df[genes] = gene_counts_df[genes].astype(int)

    # Split off CellID 0 (spots that landed on background) into its own table
    background = gene_counts_df[gene_counts_df["CellID"] == 0]
    gene_counts_df = gene_counts_df[gene_counts_df["CellID"] != 0]

    return gene_counts_df, background


if __name__ == "__main__":
    # CLI entry point: read a spot table and a labeled cell mask, assign spots
    # to cells, and write a cell-by-gene CSV next to the working directory.
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--spot_table", help="Spot table to project.")
    # NOTE(review): help text says "Sample ID." but this argument is the cell
    # mask image path — the help string should be corrected upstream.
    parser.add_argument("-c", "--cell_mask", help="Sample ID.")
    parser.add_argument("--tag", type=str, help="Additional tag to append to filename")

    args = parser.parse_args()

    ## Read in spot table (tab-separated: x, y, z, gene, plus a trailing empty column)
    spot_data = pd.read_csv(
        args.spot_table, names=["x", "y", "z", "gene", "empty"], sep="\t", header=None, index_col=None
    )

    # Labeled segmentation mask; label 0 is treated as background downstream
    cell_mask = tiff.imread(args.cell_mask)

    # NOTE(review): `background` (counts on CellID 0) is computed but never
    # written out — confirm whether it should also be saved.
    gene_counts_df, background = assign_spots2cell(spot_data, cell_mask)

    # Derive the output name from the spot-table filename (extension stripped)
    basename = os.path.basename(args.spot_table)
    basename = os.path.splitext(basename)[0]
    # NOTE(review): if --tag is omitted the literal string "None" appears in
    # the filename — confirm callers (the Nextflow module) always pass --tag.
    gene_counts_df.to_csv(f"{basename}.{args.tag}.cellxgene.csv", sep=",", header=True, index=False)
15 changes: 1 addition & 14 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ process {
withName: 'MOLCART_QC' {
publishDir = [
path: { "${params.outdir}/molcart_qc" },
pattern: { "*.csv" }
pattern: "*.csv"
]
}

Expand Down Expand Up @@ -91,14 +91,6 @@ process {
].join(" ").trim()
}

withName: "PROJECT_SPOTS" {
memory = "16GB"
publishDir = [
path: "${params.outdir}/projectedspots",
pattern: "*.{tiff,csv}"
]
}

withName: "CLAHE_DASK" {
memory = "16GB"
ext.when = { !params.skip_clahe }
Expand Down Expand Up @@ -150,9 +142,4 @@ process {
saveAs: { filename -> "${meta.id}_cellpose_mask.tif" }
]
}

withName: "MCQUANT" {
ext.args = "--intensity_props intensity_sum"
}

}
5 changes: 0 additions & 5 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,6 @@
"git_sha": "6f150e1503c0826c21fedf1fa566cdbecbe98ec7",
"installed_by": ["modules"]
},
"mcquant": {
"branch": "master",
"git_sha": "b9829e1064382745d8dff7f1d74d2138d2864f71",
"installed_by": ["modules"]
},
"mindagap/mindagap": {
"branch": "master",
"git_sha": "240937a2a9c30298110753292be041188891f2cb",
Expand Down
1 change: 1 addition & 0 deletions modules/local/clahe_dask.nf
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
process CLAHE_DASK{
debug false
tag "Applying CLAHE to $meta.id"
label 'process_low'

container 'ghcr.io/schapirolabor/background_subtraction:v0.3.3'

Expand Down
1 change: 1 addition & 0 deletions modules/local/create_stack.nf
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
process CREATE_STACK {
tag "Stacking channels for $meta.id"
label 'process_medium'

container 'ghcr.io/schapirolabor/background_subtraction:v0.3.3'

Expand Down
Loading

0 comments on commit 1822788

Please sign in to comment.