Commit

Resolving conflicts with new template
kbestak committed Sep 27, 2023
2 parents 3a1dcd8 + cd6ed39 commit ba4ab94
Showing 32 changed files with 1,266 additions and 218 deletions.
12 changes: 5 additions & 7 deletions README.md
@@ -11,15 +11,13 @@

[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23molkart-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/molkart)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)

## Introduction
## Important note

**nf-core/molkart** is a bioinformatics pipeline that ...
This pipeline is under active development and is not yet functional. If you want to process your Resolve Bioscience data in the meantime, please use https://github.com/FloWuenne/nf-MolKart, a working Nextflow pipeline outside the nf-core ecosystem. In the coming weeks, nf-MolKart will be fully transitioned into nf-core/molkart.

<!-- TODO nf-core:
Complete this sentence with a 2-3 sentence summary of what types of data the pipeline ingests, a brief overview of the
major pipeline sections and the types of output it produces. You're giving an overview to someone new
to nf-core here, in 15-20 seconds. For an example, see https://github.com/nf-core/rnaseq/blob/master/README.md#introduction
-->
## Introduction

**nf-core/molkart** is a pipeline for processing Molecular Cartography data from Resolve Bioscience (combinatorial FISH). It takes as input a table of FISH spot positions (x, y, z, gene), a corresponding DAPI image (TIFF format), and optionally a membrane staining image (TIFF format). nf-core/molkart performs end-to-end processing of the data, including image processing, QC filtering of spots, cell segmentation, and spot-to-cell assignment, and reports quality metrics such as the spot assignment rate, average spots per cell, and segmentation mask size ranges.
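
A minimal example samplesheet, sketched from the input schema updated in this commit (`assets/schema_input.json`); the sample name and file paths below are placeholders:

```csv
sample,nuclear_image,spot_table
sample1,/path/to/sample1_DAPI.tiff,/path/to/sample1_spots.txt
```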

<!-- TODO nf-core: Include a figure that guides the user through the major workflow steps. Many nf-core
workflows use the "tube map" design for that. See https://nf-co.re/docs/contributing/design_guidelines#examples for examples. -->
26 changes: 9 additions & 17 deletions assets/schema_input.json
@@ -9,28 +9,20 @@
"properties": {
"sample": {
"type": "string",
"pattern": "^\\S+$",
"pattern": ".*$",
"errorMessage": "Sample name must be provided and cannot contain spaces"
},
"fastq_1": {
"nuclear_image": {
"type": "string",
"pattern": "^\\S+\\.f(ast)?q\\.gz$",
"errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
"pattern": ".*.tiff$",
"errorMessage": "Nuclear image must be provided, cannot contain spaces and must have extension '.tiff'"
},
"fastq_2": {
"errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'",
"anyOf": [
{
"type": "string",
"pattern": "^\\S+\\.f(ast)?q\\.gz$"
},
{
"type": "string",
"maxLength": 0
}
]
"spot_table": {
"type": "string",
"pattern": ".*.txt$",
"errorMessage": "Spot table must be provided, has to have shape x,y,z,gene with sep = '\t', cannot contain spaces and must have extension '.txt'"
}
},
"required": ["sample", "fastq_1"]
"required": ["sample", "nuclear_image", "spot_table"]
}
}
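
A note on the new `pattern` values: the dot before the extension is not escaped and the `^\S+` anchor was dropped, so `.*.txt$` also accepts names like `spots_txt` or paths containing spaces, even though the error messages still say spaces are not allowed. A standalone sketch (plain Python `re`, not part of this commit) comparing the loose pattern with a stricter alternative such as `^\S+\.txt$`:

```python
import re

loose = r".*.txt$"        # pattern from the schema above; "." matches any character
strict = r"^\S+\.txt$"    # hypothetical stricter pattern: no whitespace, literal ".txt"

for name in ["spots.txt", "spots_txt", "my spots.txt"]:
    # loose accepts all three names; strict accepts only "spots.txt"
    print(f"{name!r}: loose={bool(re.search(loose, name))}, strict={bool(re.search(strict, name))}")
```
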
181 changes: 181 additions & 0 deletions bin/apply_clahe.dask.py
@@ -0,0 +1,181 @@
#!/usr/bin/env python
from __future__ import print_function, division
import argparse
from argparse import ArgumentParser as AP
from os.path import abspath
import sys
import time
import numpy as np
import tifffile
import zarr
import skimage.transform
from skimage.exposure import equalize_adapthist
from aicsimageio import aics_image as AI
# from ome_types import from_tiff, to_xml  # only needed for the commented-out metadata step below
# from memory_profiler import profile
# This API is apparently changing in skimage 1.0 but it's not clear to
# me what the replacement will be, if any. We'll explicitly import
# this so it will break loudly if someone tries this with skimage 1.0.
try:
    from skimage.util.dtype import _convert as dtype_convert
except ImportError:
    from skimage.util.dtype import convert as dtype_convert


def get_args():
    # Script description
    description = """Easy-to-use, large scale CLAHE"""

    # Add parser
    parser = AP(description=description, formatter_class=argparse.RawDescriptionHelpFormatter)

    # Sections
    inputs = parser.add_argument_group(title="Required Input", description="Path to required input file")
    inputs.add_argument("-r", "--raw", dest="raw", action="store", required=True, help="File path to input image.")
    inputs.add_argument("-o", "--output", dest="output", action="store", required=True, help="Path to output image.")
    inputs.add_argument("-c", "--channel", dest="channel", action="store", required=True, help="Channel on which CLAHE will be applied")
    inputs.add_argument("-l", "--cliplimit", dest="clip", action="store", required=True, help="Clip Limit for CLAHE")
    inputs.add_argument("--kernel", dest="kernel", action="store", required=False, default=None, help="Kernel size for CLAHE")
    inputs.add_argument("-g", "--nbins", dest="nbins", action="store", required=False, default=256, help="Number of bins for CLAHE")
    inputs.add_argument("-p", "--pixel-size", dest="pixel_size", action="store", required=True, help="Image pixel size")

    arg = parser.parse_args()

    # Standardize paths and coerce numeric arguments
    arg.raw = abspath(arg.raw)
    arg.channel = int(arg.channel)
    arg.clip = float(arg.clip)
    arg.pixel_size = float(arg.pixel_size)
    arg.nbins = int(arg.nbins)
    # --kernel defaults to None, so only cast when a value was provided
    arg.kernel = int(arg.kernel) if arg.kernel is not None else None

    return arg

def preduce(coords, img_in, img_out):
    print(img_in.dtype)
    (iy1, ix1), (iy2, ix2) = coords
    (oy1, ox1), (oy2, ox2) = np.array(coords) // 2
    tile = skimage.img_as_float32(img_in[iy1:iy2, ix1:ix2])
    tile = skimage.transform.downscale_local_mean(tile, (2, 2))
    tile = dtype_convert(tile, 'uint16')
    # tile = dtype_convert(tile, img_in.dtype)
    img_out[oy1:oy2, ox1:ox2] = tile

def format_shape(shape):
    return "%dx%d" % (shape[1], shape[0])

def subres_tiles(level, level_full_shapes, tile_shapes, outpath, scale):
    print(f"\n processing level {level}")
    assert level >= 1
    num_channels, h, w = level_full_shapes[level]
    tshape = tile_shapes[level] or (h, w)
    tiff = tifffile.TiffFile(outpath)
    zimg = zarr.open(tiff.aszarr(series=0, level=level-1, squeeze=False))
    for c in range(num_channels):
        sys.stdout.write(
            f"\r processing channel {c + 1}/{num_channels}"
        )
        sys.stdout.flush()
        th = tshape[0] * scale
        tw = tshape[1] * scale
        for y in range(0, zimg.shape[1], th):
            for x in range(0, zimg.shape[2], tw):
                a = zimg[c, y:y+th, x:x+tw, 0]
                a = skimage.transform.downscale_local_mean(
                    a, (scale, scale)
                )
                if np.issubdtype(zimg.dtype, np.integer):
                    a = np.around(a)
                a = a.astype('uint16')
                yield a

def main(args):
    print(f"Head directory = {args.raw}")
    print(f"Channel = {args.channel}, ClipLimit = {args.clip}, nbins = {args.nbins}, kernel_size = {args.kernel}, pixel_size = {args.pixel_size}")

    # clahe = cv2.createCLAHE(clipLimit = int(args.clip), tileGridSize=tuple(map(int, args.grid)))

    img_raw = AI.AICSImage(args.raw)
    img_dask = img_raw.get_image_dask_data("CYX").astype('uint16')
    adapted = img_dask[args.channel].compute() / 65535
    adapted = (equalize_adapthist(adapted, kernel_size=args.kernel, clip_limit=args.clip, nbins=args.nbins) * 65535).astype('uint16')
    img_dask[args.channel] = adapted

    # construct levels
    tile_size = 1024
    scale = 2

    pixel_size = args.pixel_size
    dtype = img_dask.dtype
    base_shape = img_dask[0].shape
    num_channels = img_dask.shape[0]
    num_levels = (np.ceil(np.log2(max(base_shape) / tile_size)) + 1).astype(int)
    factors = 2 ** np.arange(num_levels)
    shapes = (np.ceil(np.array(base_shape) / factors[:, None])).astype(int)

    print("Pyramid level sizes: ")
    for i, shape in enumerate(shapes):
        print(f" level {i+1}: {format_shape(shape)}", end="")
        if i == 0:
            print("(original size)", end="")
        print()
    print()
    print(shapes)

    level_full_shapes = []
    for shape in shapes:
        level_full_shapes.append((num_channels, shape[0], shape[1]))
    level_shapes = shapes
    tip_level = np.argmax(np.all(level_shapes < tile_size, axis=1))
    tile_shapes = [
        (tile_size, tile_size) if i <= tip_level else None
        for i in range(len(level_shapes))
    ]

    # write pyramid
    with tifffile.TiffWriter(args.output, ome=True, bigtiff=True) as tiff:
        tiff.write(
            data=img_dask,
            shape=level_full_shapes[0],
            subifds=int(num_levels - 1),
            dtype=dtype,
            resolution=(10000 / pixel_size, 10000 / pixel_size, "centimeter"),
            tile=tile_shapes[0],
        )
        for level, (shape, tile_shape) in enumerate(
            zip(level_full_shapes[1:], tile_shapes[1:]), 1
        ):
            tiff.write(
                data=subres_tiles(level, level_full_shapes, tile_shapes, args.output, scale),
                shape=shape,
                subfiletype=1,
                dtype=dtype,
                tile=tile_shape,
            )

    # note about metadata: the channels, planes etc were adjusted not to include the removed channels, however
    # the channel ids have stayed the same as before removal. E.g if channels 1 and 2 are removed,
    # the channel ids in the metadata will skip indices 1 and 2 (channel_id:0, channel_id:3, channel_id:4 ...)
    # tifffile.tiffcomment(args.output, to_xml(metadata))
    print()


if __name__ == '__main__':
    # Read in arguments
    args = get_args()

    # Run script
    st = time.time()
    main(args)
    rt = time.time() - st
    print(f"Script finished in {rt // 60:.0f}m {rt % 60:.0f}s")
84 changes: 84 additions & 0 deletions bin/collect_QC.py
@@ -0,0 +1,84 @@
#!/usr/bin/env python

#### This script takes regionprops_table output from mcquant and the raw spot tables from Resolve Bioscience as input
#### and calculates QC metrics for segmentation masks and spot assignments

import argparse
import pandas as pd

def summarize_spots(spot_table):
    ## Calculate number of spots per gene
    tx_per_gene = spot_table.groupby('gene').count().reset_index()

    ## Calculate the total number of spots in spot_table
    total_spots = spot_table.shape[0]

    return (tx_per_gene, total_spots)

def summarize_segmasks(mcquant, spots_summary):
    ## Calculate the total number of cells (rows) in mcquant
    total_cells = mcquant.shape[0]

    ## Calculate the average segmentation area from column Area in mcquant
    avg_area = mcquant['Area'].mean()

    ## Calculate the % of spots assigned
    ## Subset mcquant for all columns with _intensity_sum in the column name and sum the column values
    spot_assign = mcquant.filter(regex='_intensity_sum').sum(axis=1)
    spot_assign_total = int(sum(spot_assign))
    ## Guard against empty segmentation masks (no cells)
    spot_assign_per_cell = spot_assign_total / total_cells if total_cells else 0
    spot_assign_percent = spot_assign_total / spots_summary[1] * 100

    return (total_cells, avg_area, spot_assign_per_cell, spot_assign_total, spot_assign_percent)

if __name__ == "__main__":
# Write an argparse with input options mcquant_in, spots and output options outdir, sample_id
parser = argparse.ArgumentParser()
parser.add_argument(
"-i",
"--mcquant",
help="mcquant regionprops_table."
)
parser.add_argument(
"-s",
"--spots",
help="Resolve biosciences spot table."
)
parser.add_argument(
"-o",
"--outdir",
help="Output directory."
)

parser.add_argument(
"-d",
"--sample_id",
help="Sample ID."
)

parser.add_argument(
"-g",
"--segmentation_method",
help="Segmentation method used."
)

args = parser.parse_args()

## Read in mcquant table
mcquant = pd.read_csv(args.mcquant)

## Read in spot table
spots = pd.read_table(args.spots, sep='\t', names=['x', 'y', 'z', 'gene'])
spots = spots[~spots.gene.str.contains("Duplicated")]

## Summarize spots table
summary_spots = summarize_spots(spots)
summary_segmentation = summarize_segmasks(mcquant,summary_spots)

## Create pandas data frame with one row per parameter and write each value in summary_segmentation to a new row in the data frame
summary_df = pd.DataFrame(columns=['sample_id','segmentation_method','total_cells','avg_area','total_spots','spot_assign_per_cell','spot_assign_total','spot_assign_percent',])
summary_df.loc[0] = [args.sample_id + "_" + args.segmentation_method,args.segmentation_method,summary_segmentation[0],summary_segmentation[1],summary_spots[1],summary_segmentation[2],summary_segmentation[3],summary_segmentation[4]]

# Write summary_df to a csv file
summary_df.to_csv(f"{args.outdir}/{args.sample_id}.{args.segmentation_method}.spot_QC.csv", header = True, index=False)
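
As a rough illustration of the inputs this script expects and the metrics it reports, here is a toy example built with pandas; the column names follow the code above, but the tables and values are made up:

```python
import pandas as pd

# Toy mcquant regionprops table: one row per cell, one *_intensity_sum column per gene
mcquant = pd.DataFrame({
    "CellID": [1, 2, 3],
    "Area": [250.0, 310.0, 180.0],
    "GeneA_intensity_sum": [12, 0, 3],
    "GeneB_intensity_sum": [5, 8, 0],
})

# Toy Resolve spot table with columns x, y, z, gene (tab-separated in the real input)
spots = pd.DataFrame({
    "x": range(40), "y": range(40), "z": [0] * 40,
    "gene": ["GeneA"] * 25 + ["GeneB"] * 15,
})

total_spots = spots.shape[0]                                              # 40
total_cells = mcquant.shape[0]                                            # 3
assigned = int(mcquant.filter(regex="_intensity_sum").sum(axis=1).sum())  # 28

print(assigned / total_cells)        # average spots per cell: ~9.33
print(assigned / total_spots * 100)  # spot assignment rate: 70.0 %
print(mcquant["Area"].mean())        # average mask area: ~246.7
```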
