Skip to content

Commit

Permalink
Merge pull request #49 from nf-core/anndata_output
Browse files Browse the repository at this point in the history
Added anndata as an output format
  • Loading branch information
FloWuenne authored Jan 5, 2024
2 parents 13fbef4 + e095a69 commit 252f68d
Show file tree
Hide file tree
Showing 19 changed files with 196 additions and 55 deletions.
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,20 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## v1.0.1dev - [2024.01.04]

### `Added`

- Added createanndata process to workflow. This process will generate a spatial anndata object from the spot2cell output. The anndata object will be written to /anndata in the output folder.
- added tests for createanndata

### `Fixed`

- Updated version numbers for all local modules using the molkart-local container to v0.0.3
- spot2cell - removed tag, output name now required, output name defined in modules.config
- output documentation for create training subset
- formatting in local modules

## v1.0.1dev - [2023.12.19]

### `Fixed`
Expand Down
4 changes: 4 additions & 0 deletions CITATIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
## Pipeline tools

- [anndata](https://anndata.readthedocs.io/en/latest/)

> Isaac Virshup, Sergei Rybakov, Fabian J. Theis, Philipp Angerer, F. Alexander Wolf. anndata: Annotated data. bioRxiv 2021.12.16.473007; doi: https://doi.org/10.1101/2021.12.16.473007
- [Cellpose](https://www.cellpose.org/)

> Stringer, C., Wang, T., Michaelos, M. et al. Cellpose: a generalist algorithm for cellular segmentation. Nat Methods 18, 100–106 (2021). https://doi.org/10.1038/s41592-020-01018-x
Expand Down
62 changes: 62 additions & 0 deletions bin/create_anndata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env python
import pandas as pd
import numpy as np
from anndata import AnnData
import argparse
from argparse import ArgumentParser as AP
from os.path import abspath
import time
from scipy.sparse import csr_matrix


def get_args():
    """Parse the command-line arguments of the AnnData creation script.

    ``--input``, ``--spatial_cols`` and ``--output`` are all mandatory; when
    one of them is missing argparse prints a usage message and exits instead
    of letting a ``None`` value reach ``abspath`` or the DataFrame lookup.

    Returns:
        argparse.Namespace: ``input`` and ``output`` as absolute paths and
        ``spatial_cols`` as a list of column names.

    Raises:
        SystemExit: raised by argparse when a required option is missing
            (exit code 2) or when ``--version`` is requested (exit code 0).
    """
    # Script description
    description = """Anndata object creation"""

    # Add parser
    parser = AP(description=description, formatter_class=argparse.RawDescriptionHelpFormatter)

    # Sections
    inputs = parser.add_argument_group(title="Required Input", description="Path to required input file")
    inputs.add_argument("-i", "--input", type=str, required=True, help="Path to the spot2cell csv file.")
    inputs.add_argument("-s", "--spatial_cols", nargs="+", required=True, help="Column names for location data.")
    inputs.add_argument(
        "-o", "--output", dest="output", action="store", required=True, help="Path to output anndata object."
    )
    inputs.add_argument("--version", action="version", version="0.1.0")
    arg = parser.parse_args()

    # Resolve both paths up front so later steps do not depend on the
    # current working directory.
    arg.input = abspath(arg.input)
    arg.output = abspath(arg.output)
    return arg


def create_spatial_anndata(input, spatial_cols):
    """Build a spatial AnnData object from a spot2cell CSV table.

    The table is split at the ``X_centroid`` column: every column before it
    becomes the (sparse) count matrix, and ``X_centroid`` together with all
    columns after it is stored as per-cell metadata.

    Args:
        input: Path of the CSV file, read with ``pandas.read_csv``.
        spatial_cols: Names of the columns holding the cell coordinates;
            their values are stored in ``adata.obsm["spatial"]``.

    Returns:
        AnnData: count matrix in ``X`` (CSR), coordinates in
        ``obsm["spatial"]``, metadata columns in ``obs`` and observation
        names ``Cell_0``, ``Cell_1``, ...

    Raises:
        KeyError: if ``X_centroid`` or one of ``spatial_cols`` is not a
            column of the table.
    """
    df = pd.read_csv(input)
    # Use the function parameter, not the script-level ``args`` global, so the
    # function also works when imported and called from other code.
    spatial_coords = np.array(df[spatial_cols].values.tolist())
    # Find the index of the 'X_centroid' column
    x_centroid_index = df.columns.get_loc("X_centroid")
    # All columns from 'X_centroid' to the end are treated as metadata
    metadata_cols = df.columns[x_centroid_index:]
    metadata = df[metadata_cols]

    # Everything left of 'X_centroid' forms the count table
    count_table = csr_matrix(df.drop(list(metadata_cols), axis=1).values.tolist())
    adata = AnnData(count_table, obsm={"spatial": spatial_coords})
    # Add the metadata to adata.obs
    for col in metadata.columns:
        adata.obs[col] = metadata[col].values
    adata.obs_names = [f"Cell_{i:d}" for i in range(adata.n_obs)]
    return adata


def main(args):
    """Create the spatial AnnData object and write it to disk.

    Args:
        args: Parsed command-line namespace providing ``input``,
            ``spatial_cols`` and ``output``.
    """
    spatial_adata = create_spatial_anndata(args.input, args.spatial_cols)
    spatial_adata.write(args.output)


if __name__ == "__main__":
    # Keep the name `args`: it is a module-level global here, and other
    # functions in this file may refer to it.
    args = get_args()

    # Time the whole run and report it as minutes and seconds.
    started = time.time()
    main(args)
    rt = time.time() - started
    print(f"Script finished in {rt // 60:.0f}m {rt % 60:.0f}s")
10 changes: 1 addition & 9 deletions bin/spot2cell.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ def assign_spots2cell(spot_table, cell_mask):
parser = argparse.ArgumentParser()
parser.add_argument("-s", "--spot_table", help="Spot table to project.")
parser.add_argument("-c", "--cell_mask", help="Sample ID.")
parser.add_argument("--tag", type=str, help="Additional tag to append to filename")
parser.add_argument("--output", type=str, help="Output path")
parser.add_argument("--version", action="version", version="0.1.0")

Expand All @@ -126,11 +125,4 @@ def assign_spots2cell(spot_table, cell_mask):

gene_counts_df, background = assign_spots2cell(spot_data, cell_mask)

if args.output:
outpath = args.output

else:
basename = os.path.basename(args.spot_table)
basename = os.path.splitext(basename)[0]
outpath = f"{basename}.{args.tag}.cellxgene.csv"
gene_counts_df.to_csv(outpath, sep=",", header=True, index=False)
gene_counts_df.to_csv(args.output, sep=",", header=True, index=False)
11 changes: 10 additions & 1 deletion conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ process {
}

withName: "SPOT2CELL" {
ext.prefix = { "${meta.id}_${tag}"}
ext.prefix = { "${meta.id}_${meta.segmentation}"}
publishDir = [
path: { "${params.outdir}/spot2cell" },
pattern: "*.csv",
Expand Down Expand Up @@ -211,4 +211,13 @@ process {
saveAs: { filename -> "${meta.id}_cellpose_mask.tif" }
]
}

// Settings for the CREATE_ANNDATA process: output files are prefixed with
// "<sample id>_<segmentation>" and files matching the pattern below are
// published to "<outdir>/anndata" using the pipeline-wide publish mode.
withName: "CREATE_ANNDATA" {
ext.prefix = { "${meta.id}_${meta.segmentation}"}
publishDir = [
path: "${params.outdir}/anndata",
mode: params.publish_dir_mode,
pattern: "*.{adata}"
]
}
}
37 changes: 25 additions & 12 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [segmentation](#segmentation) - Segment single cells from provided image using segmentation method of choice (Cellpose, Mesmer, ilastik) and filter them by size.
- [Mindagap_duplicatefinder](#Mindagap) - Take a spot table and search for duplicates along grid lines.
- [Spot2cell](#spot2cell) - Assign non-duplicated spots to segmented cells based on segmentation mask and extract cell shape information.
- [Create AnnData](#create_anndata) - Creates a spatial AnnData object as described in the [Squidpy tutorial](https://squidpy.readthedocs.io/en/stable/notebooks/tutorials/tutorial_read_spatial.html).
- [MolkartQC](#molkartqc) - Produce QC metrics specific to this pipeline.
- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline.
- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution.
Expand Down Expand Up @@ -93,6 +94,18 @@ Create stack is a local module used to merge images into a stack as preparation

Spot2cell is a local module that assigns spots (without Duplicates) to cells via a spot table and segmentation mask.

### Create_anndata

<details markdown="1">
<summary>Output files</summary>

- `anndata/`
- `*.adata`: AnnData object containing the spot count table in `adata.X`, the spatial locations of cells in `adata.obsm["spatial"]`, and per-cell metadata such as 'Area', 'MajorAxisLength', 'MinorAxisLength', 'Eccentricity', 'Solidity', 'Extent' and 'Orientation' in `adata.obs`.

</details>

CREATE_ANNDATA is a local module that generates an [AnnData object](https://anndata.readthedocs.io/en/latest/) storing expression, metadata and spatial locations of cells.

### MolkartQC

<details markdown="1">
Expand Down Expand Up @@ -123,17 +136,6 @@ MolkartQC is a local module used for gathering useful quality-control metrics fo

Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.

### Pipeline information

<details markdown="1">
<summary>Output files</summary>

- `pipeline_info/`
- Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`.
- Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline.
- Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`.
- Parameters used by the pipeline run: `params.json`.

### create-training-subset

<details markdown="1">
Expand All @@ -147,7 +149,18 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ

</details>

Spot2cell is a local module that assigns spots (without Duplicates) to cells via a spot table and segmentation mask.
Create training subset is an optional group of modules that create crops in `hdf5` and `tiff` formats, as well as provide the crop overview for reusability.

### Pipeline information

<details markdown="1">
<summary>Output files</summary>

- `pipeline_info/`
- Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`.
- Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline.
- Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`.
- Parameters used by the pipeline run: `params.json`.

</details>

Expand Down
4 changes: 2 additions & 2 deletions modules/local/clahe.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ process CLAHE{
tag "$meta.id"
label 'process_medium'

container 'ghcr.io/schapirolabor/molkart-local:v0.0.1'
container 'ghcr.io/schapirolabor/molkart-local:v0.0.3'

input:
tuple val(meta), path(image)
Expand All @@ -15,7 +15,7 @@ process CLAHE{
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
"""
apply_clahe.dask.py \\
Expand Down
33 changes: 33 additions & 0 deletions modules/local/createanndata.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Local module that runs create_anndata.py on a spot2cell table and emits the
// resulting "<prefix>.adata" file together with a versions.yml record.
process CREATE_ANNDATA {
tag "$meta.id"
label 'process_low'

container 'ghcr.io/schapirolabor/molkart-local:v0.0.3'

input:
// meta: sample metadata map (meta.id is used for the tag and default prefix);
// spot2cell: table passed to create_anndata.py via --input.
tuple val(meta), path(spot2cell)

output:
// NOTE(review): the .adata file is emitted under the channel name `stack`,
// which looks carried over from CREATE_STACK. Confirm with the calling
// workflow before renaming, since callers reference this emit name.
tuple val(meta), path("*.adata") , emit: stack
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
// Extra CLI arguments and the output prefix can be overridden through
// task.ext; the prefix falls back to the sample id.
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"

"""
create_anndata.py \\
--input ${spot2cell} \\
--spatial_cols X_centroid Y_centroid \\
--output ${prefix}.adata \\
$args
cat <<-END_VERSIONS > versions.yml
"${task.process}":
molkart_createanndata: \$(create_anndata.py --version)
END_VERSIONS
"""
}
8 changes: 4 additions & 4 deletions modules/local/createstack.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,20 @@ process CREATE_STACK {
tag "$meta.id"
label 'process_low'

container 'ghcr.io/schapirolabor/molkart-local:v0.0.1'
container 'ghcr.io/schapirolabor/molkart-local:v0.0.3'

input:
tuple val(meta), path(image)

output:
tuple val(meta), path("*.ome.tif"), emit: stack
path "versions.yml" , emit: versions
tuple val(meta), path("*.ome.tif") , emit: stack
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"

"""
Expand Down
2 changes: 1 addition & 1 deletion modules/local/crophdf5.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ process CROPHDF5 {
tag "$meta.id"
label 'process_single'

container 'ghcr.io/schapirolabor/molkart-local:v0.0.1'
container 'ghcr.io/schapirolabor/molkart-local:v0.0.3'

input:
tuple val(meta), path(image_stack), val(num_channels)
Expand Down
4 changes: 2 additions & 2 deletions modules/local/croptiff.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ process CROPTIFF {
tag "$meta.id"
label 'process_single'

container 'ghcr.io/schapirolabor/molkart-local:v0.0.1'
container 'ghcr.io/schapirolabor/molkart-local:v0.0.3'

input:
tuple val(meta), path(image_stack)
Expand All @@ -17,7 +17,7 @@ process CROPTIFF {
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"

"""
Expand Down
4 changes: 2 additions & 2 deletions modules/local/maskfilter.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ process MASKFILTER {
tag "$meta.id"
label 'process_medium'

container 'ghcr.io/schapirolabor/molkart-local:v0.0.1'
container 'ghcr.io/schapirolabor/molkart-local:v0.0.3'

input:
tuple val(meta), path(mask)
Expand All @@ -16,7 +16,7 @@ process MASKFILTER {
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"

"""
Expand Down
4 changes: 2 additions & 2 deletions modules/local/molkartqc.nf
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ process MOLKARTQC{
tag "$meta.id"
label 'process_single'

container 'ghcr.io/schapirolabor/molkart-local:v0.0.1'
container 'ghcr.io/schapirolabor/molkart-local:v0.0.3'

input:
tuple val(meta), path(spot_table), path(cellxgene_table), val(segmethod), path(filterqc)
Expand All @@ -15,7 +15,7 @@ process MOLKARTQC{
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"

"""
Expand Down
2 changes: 1 addition & 1 deletion modules/local/molkartqcpng.nf
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
process MOLKARTQCPNG {
label 'process_single'

container 'ghcr.io/schapirolabor/molkart-local:v0.0.1'
container 'ghcr.io/schapirolabor/molkart-local:v0.0.3'

input:
path(png)
Expand Down
6 changes: 2 additions & 4 deletions modules/local/spot2cell.nf
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,11 @@ process SPOT2CELL{
tag "$meta.id"
label 'process_single'

container 'ghcr.io/schapirolabor/molkart-local:v0.0.1'
container 'ghcr.io/schapirolabor/molkart-local:v0.0.3'

input:
tuple val(meta) , path(spot_table)
tuple val(meta2), path(cell_mask)
val(tag)

output:
tuple val(meta), path("*.csv"), emit: cellxgene_table
Expand All @@ -18,14 +17,13 @@ process SPOT2CELL{
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"

"""
spot2cell.py \\
--spot_table ${spot_table} \\
--cell_mask ${cell_mask} \\
--tag ${tag} \\
--output ${prefix}.csv \\
$args
Expand Down
Loading

0 comments on commit 252f68d

Please sign in to comment.