Merge pull request #49 from nf-core/anndata_output

Added anndata as an output format
nf-core · Jan 5, 2024 · 252f68d · 252f68d
2 parents 13fbef4 + e095a69
commit 252f68d
Show file tree

Hide file tree

Showing 19 changed files with 196 additions and 55 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,20 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## v1.0.1dev - [2024.01.04]
+
+### `Added`
+
+- Added createanndata process to workflow. This process will generate a spatial anndata object from the spot2cell output. The anndata object will be written to /anndata in the output folder.
+- added tests for createanndata
+
+### `Fixed`
+
+- Updated version numbers for all local modules using the molkart-local container to v0.0.3
+- spot2cell - removed tag, output name now required, output name defined in modules.config
+- output documentation for create training subset
+- formatting in local modules
+
 ## v1.0.1dev - [2023.12.19]
 
 ### `Fixed`

diff --git a/CITATIONS.md b/CITATIONS.md
@@ -10,6 +10,10 @@
 
 ## Pipeline tools
 
+- [anndata](https://anndata.readthedocs.io/en/latest/)
+
+  > Isaac Virshup, Sergei Rybakov, Fabian J. Theis, Philipp Angerer, F. Alexander Wolf. anndata: Annotated data. bioRxiv 2021.12.16.473007; doi: https://doi.org/10.1101/2021.12.16.473007
+
 - [Cellpose](https://www.cellpose.org/)
 
   > Stringer, C., Wang, T., Michaelos, M. et al. Cellpose: a generalist algorithm for cellular segmentation. Nat Methods 18, 100–106 (2021). https://doi.org/10.1038/s41592-020-01018-x

diff --git a/bin/create_anndata.py b/bin/create_anndata.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+import pandas as pd
+import numpy as np
+from anndata import AnnData
+import argparse
+from argparse import ArgumentParser as AP
+from os.path import abspath
+import time
+from scipy.sparse import csr_matrix
+
+
+def get_args():
+    # Script description
+    description = """Anndata object creation"""
+
+    # Add parser
+    parser = AP(description=description, formatter_class=argparse.RawDescriptionHelpFormatter)
+
+    # Sections
+    inputs = parser.add_argument_group(title="Required Input", description="Path to required input file")
+    inputs.add_argument("-i", "--input", type=str, help="Path to the spot2cell csv file.")
+    inputs.add_argument("-s", "--spatial_cols", nargs="+", help="Column names for location data.")
+    inputs.add_argument(
+        "-o", "--output", dest="output", action="store", required=True, help="Path to output anndata object."
+    )
+    inputs.add_argument("--version", action="version", version="0.1.0")
+    arg = parser.parse_args()
+    arg.input = abspath(arg.input)
+    arg.output = abspath(arg.output)
+    return arg
+
+
+def create_spatial_anndata(input, spatial_cols):
+    df = pd.read_csv(input)
+    spatial_coords = np.array(df[args.spatial_cols].values.tolist())
+    # Find the index of 'Y_centroid' column
+    y_centroid_index = df.columns.get_loc("X_centroid")
+    # Create a list of all columns from 'Y_centroid' to the end
+    metadata_cols = df.columns[y_centroid_index:]
+    # Extract the excluded columns as metadata
+    metadata = df[metadata_cols]
+
+    count_table = csr_matrix(df.drop(list(metadata_cols), axis=1).values.tolist())
+    adata = AnnData(count_table, obsm={"spatial": spatial_coords})
+    # Add the metadata to adata.obs
+    for col in metadata.columns:
+        adata.obs[col] = metadata[col].values
+    adata.obs_names = [f"Cell_{i:d}" for i in range(adata.n_obs)]
+    return adata
+
+
+def main(args):
+    adata = create_spatial_anndata(args.input, args.spatial_cols)
+    adata.write(args.output)
+
+
+if __name__ == "__main__":
+    args = get_args()
+    st = time.time()
+    main(args)
+    rt = time.time() - st
+    print(f"Script finished in {rt // 60:.0f}m {rt % 60:.0f}s")
diff --git a/bin/spot2cell.py b/bin/spot2cell.py
@@ -111,7 +111,6 @@ def assign_spots2cell(spot_table, cell_mask):
     parser = argparse.ArgumentParser()
     parser.add_argument("-s", "--spot_table", help="Spot table to project.")
     parser.add_argument("-c", "--cell_mask", help="Sample ID.")
-    parser.add_argument("--tag", type=str, help="Additional tag to append to filename")
     parser.add_argument("--output", type=str, help="Output path")
     parser.add_argument("--version", action="version", version="0.1.0")
 
@@ -126,11 +125,4 @@ def assign_spots2cell(spot_table, cell_mask):
 
     gene_counts_df, background = assign_spots2cell(spot_data, cell_mask)
 
-    if args.output:
-        outpath = args.output
-
-    else:
-        basename = os.path.basename(args.spot_table)
-        basename = os.path.splitext(basename)[0]
-        outpath = f"{basename}.{args.tag}.cellxgene.csv"
-    gene_counts_df.to_csv(outpath, sep=",", header=True, index=False)
+    gene_counts_df.to_csv(args.output, sep=",", header=True, index=False)
diff --git a/conf/modules.config b/conf/modules.config
@@ -101,7 +101,7 @@ process {
     }
 
     withName: "SPOT2CELL" {
-        ext.prefix = { "${meta.id}_${tag}"}
+        ext.prefix = { "${meta.id}_${meta.segmentation}"}
         publishDir = [
             path: { "${params.outdir}/spot2cell" },
             pattern: "*.csv",
@@ -211,4 +211,13 @@ process {
             saveAs: { filename -> "${meta.id}_cellpose_mask.tif" }
         ]
     }
+
+    // CREATE_ANNDATA: output prefix is "<meta.id>_<meta.segmentation>"; files matching
+    // the "*.{adata}" pattern are published to "<params.outdir>/anndata".
+    withName: "CREATE_ANNDATA" {
+        ext.prefix = { "${meta.id}_${meta.segmentation}"}
+        publishDir = [
+            path: "${params.outdir}/anndata",
+            mode: params.publish_dir_mode,
+            pattern: "*.{adata}"
+        ]
+    }
 }
diff --git a/docs/output.md b/docs/output.md
@@ -16,6 +16,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [segmentation](#segmentation) - Segment single cells from provided image using segmentation method of choice (Cellpose, Mesmer, ilastik) and filter them by size.
 - [Mindagap_duplicatefinder](#Mindagap) - Take a spot table and search for duplicates along grid lines.
 - [Spot2cell](#spot2cell) - Assign non-duplicated spots to segmented cells based on segmentation mask and extract cell shape information.
+- [Create AnnData](#create_anndata) - Creates a spatial AnnData object as described in the [Squidpy tutorial](https://squidpy.readthedocs.io/en/stable/notebooks/tutorials/tutorial_read_spatial.html).
 - [MolkartQC](#molkartqc) - Produce QC metrics specific to this pipeline.
 - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline.
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution.
@@ -93,6 +94,18 @@ Create stack is a local module used to merge images into a stack as preparation
 
 Spot2cell is a local module that assigns spots (without Duplicates) to cells via a spot table and segmentation mask.
 
+### Create_anndata
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `anndata/`
+  - `*.adata`: Anndata object containing the spot count table, spatial locations of cells in `adata.obsm` and metadata like 'Area', 'MajorAxisLength', 'MinorAxisLength', 'Eccentricity', 'Solidity', 'Extent', 'Orientation' in `adata.obs`
+
+</details>
+
+CREATE_ANNDATA is a local module that generates an [AnnData object](https://anndata.readthedocs.io/en/latest/) storing expression, metadata and spatial locations of cells.
+
 ### MolkartQC
 
 <details markdown="1">
@@ -123,17 +136,6 @@ MolkartQC is a local module used for gathering useful quality-control metrics fo
 
 Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.
 
-### Pipeline information
-
-<details markdown="1">
-<summary>Output files</summary>
-
-- `pipeline_info/`
-  - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`.
-  - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline.
-  - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`.
-  - Parameters used by the pipeline run: `params.json`.
-
 ### create-training-subset
 
 <details markdown="1">
@@ -147,7 +149,18 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ
 
 </details>
 
-Spot2cell is a local module that assigns spots (without Duplicates) to cells via a spot table and segmentation mask.
+Create training subset is an optional group of modules that create crops in `hdf5` and `tiff` formats, as well as provide the crop overview for reusability.
+
+### Pipeline information
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `pipeline_info/`
+  - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`.
+  - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline.
+  - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`.
+  - Parameters used by the pipeline run: `params.json`.
 
 </details>
 

diff --git a/modules/local/clahe.nf b/modules/local/clahe.nf
@@ -2,7 +2,7 @@ process CLAHE{
     tag "$meta.id"
     label 'process_medium'
 
-    container 'ghcr.io/schapirolabor/molkart-local:v0.0.1'
+    container 'ghcr.io/schapirolabor/molkart-local:v0.0.3'
 
     input:
     tuple val(meta), path(image)
@@ -15,7 +15,7 @@ process CLAHE{
     task.ext.when == null || task.ext.when
 
     script:
-    def args = task.ext.args ?: ''
+    def args   = task.ext.args   ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
     """
     apply_clahe.dask.py \\

diff --git a/modules/local/createanndata.nf b/modules/local/createanndata.nf
@@ -0,0 +1,33 @@
+// Runs create_anndata.py on a spot2cell table and emits the resulting *.adata file
+// together with a versions.yml recording the script version.
+process CREATE_ANNDATA {
+    tag "$meta.id"
+    label 'process_low'
+
+    container 'ghcr.io/schapirolabor/molkart-local:v0.0.3'
+
+    input:
+    // meta: sample metadata map (meta.id is used for the tag and the default prefix)
+    // spot2cell: table passed to create_anndata.py via --input
+    tuple val(meta), path(spot2cell)
+
+    output:
+    // NOTE(review): the emit name `stack` looks copied from CREATE_STACK; confirm
+    // callers before renaming it.
+    tuple val(meta), path("*.adata") , emit: stack
+    path "versions.yml"              , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // Extra command-line arguments and the output prefix can be overridden via task.ext.
+    def args   = task.ext.args   ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    """
+    create_anndata.py \\
+        --input ${spot2cell} \\
+        --spatial_cols X_centroid Y_centroid \\
+        --output ${prefix}.adata \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        molkart_createanndata: \$(create_anndata.py --version)
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/createstack.nf b/modules/local/createstack.nf
@@ -2,20 +2,20 @@ process CREATE_STACK {
     tag "$meta.id"
     label 'process_low'
 
-    container 'ghcr.io/schapirolabor/molkart-local:v0.0.1'
+    container 'ghcr.io/schapirolabor/molkart-local:v0.0.3'
 
     input:
     tuple val(meta), path(image)
 
     output:
-    tuple val(meta), path("*.ome.tif"), emit: stack
-    path "versions.yml"               , emit: versions
+    tuple val(meta), path("*.ome.tif") , emit: stack
+    path "versions.yml"                , emit: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
-    def args = task.ext.args ?: ''
+    def args   = task.ext.args   ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
 
     """

diff --git a/modules/local/crophdf5.nf b/modules/local/crophdf5.nf
@@ -2,7 +2,7 @@ process CROPHDF5 {
     tag "$meta.id"
     label 'process_single'
 
-    container 'ghcr.io/schapirolabor/molkart-local:v0.0.1'
+    container 'ghcr.io/schapirolabor/molkart-local:v0.0.3'
 
     input:
     tuple val(meta), path(image_stack), val(num_channels)

diff --git a/modules/local/croptiff.nf b/modules/local/croptiff.nf
@@ -2,7 +2,7 @@ process CROPTIFF {
     tag "$meta.id"
     label 'process_single'
 
-    container 'ghcr.io/schapirolabor/molkart-local:v0.0.1'
+    container 'ghcr.io/schapirolabor/molkart-local:v0.0.3'
 
     input:
     tuple val(meta), path(image_stack)
@@ -17,7 +17,7 @@ process CROPTIFF {
     task.ext.when == null || task.ext.when
 
     script:
-    def args = task.ext.args     ?: ''
+    def args   = task.ext.args   ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
 
     """

diff --git a/modules/local/maskfilter.nf b/modules/local/maskfilter.nf
@@ -2,7 +2,7 @@ process MASKFILTER {
     tag "$meta.id"
     label 'process_medium'
 
-    container 'ghcr.io/schapirolabor/molkart-local:v0.0.1'
+    container 'ghcr.io/schapirolabor/molkart-local:v0.0.3'
 
     input:
     tuple val(meta), path(mask)
@@ -16,7 +16,7 @@ process MASKFILTER {
     task.ext.when == null || task.ext.when
 
     script:
-    def args = task.ext.args ?: ''
+    def args   = task.ext.args   ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
 
     """

diff --git a/modules/local/molkartqc.nf b/modules/local/molkartqc.nf
@@ -2,7 +2,7 @@ process MOLKARTQC{
     tag "$meta.id"
     label 'process_single'
 
-    container 'ghcr.io/schapirolabor/molkart-local:v0.0.1'
+    container 'ghcr.io/schapirolabor/molkart-local:v0.0.3'
 
     input:
     tuple val(meta), path(spot_table), path(cellxgene_table), val(segmethod), path(filterqc)
@@ -15,7 +15,7 @@ process MOLKARTQC{
     task.ext.when == null || task.ext.when
 
     script:
-    def args = task.ext.args ?: ''
+    def args   = task.ext.args   ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
 
     """

diff --git a/modules/local/molkartqcpng.nf b/modules/local/molkartqcpng.nf
@@ -1,7 +1,7 @@
 process MOLKARTQCPNG {
     label 'process_single'
 
-    container 'ghcr.io/schapirolabor/molkart-local:v0.0.1'
+    container 'ghcr.io/schapirolabor/molkart-local:v0.0.3'
 
     input:
     path(png)

diff --git a/modules/local/spot2cell.nf b/modules/local/spot2cell.nf
@@ -3,12 +3,11 @@ process SPOT2CELL{
     tag "$meta.id"
     label 'process_single'
 
-    container 'ghcr.io/schapirolabor/molkart-local:v0.0.1'
+    container 'ghcr.io/schapirolabor/molkart-local:v0.0.3'
 
     input:
     tuple val(meta) , path(spot_table)
     tuple val(meta2), path(cell_mask)
-    val(tag)
 
     output:
     tuple val(meta), path("*.csv"), emit: cellxgene_table
@@ -18,14 +17,13 @@ process SPOT2CELL{
     task.ext.when == null || task.ext.when
 
     script:
-    def args = task.ext.args ?: ''
+    def args   = task.ext.args   ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
 
     """
     spot2cell.py \\
         --spot_table ${spot_table} \\
         --cell_mask ${cell_mask} \\
-        --tag ${tag} \\
         --output ${prefix}.csv \\
         $args