Commit

Resolving conflicts with new template
kbestak committed Sep 27, 2023
2 parents 3a1dcd8 + cd6ed39 commit ba4ab94
Showing 32 changed files with 1,266 additions and 218 deletions.
12 changes: 5 additions & 7 deletions README.md
@@ -11,15 +11,13 @@

[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23molkart-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/molkart)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)

## Introduction
## Important note

**nf-core/molkart** is a bioinformatics pipeline that ...
This pipeline is under active development and is not yet functional. If you want to process your Resolve Bioscience data in the meantime, please use https://github.com/FloWuenne/nf-MolKart, a working Nextflow pipeline outside the nf-core ecosystem. In the coming weeks, nf-MolKart will be fully transitioned into nf-core/molkart.

<!-- TODO nf-core:
Complete this sentence with a 2-3 sentence summary of what types of data the pipeline ingests, a brief overview of the
major pipeline sections and the types of output it produces. You're giving an overview to someone new
to nf-core here, in 15-20 seconds. For an example, see https://github.com/nf-core/rnaseq/blob/master/README.md#introduction
-->
## Introduction

**nf-core/molkart** is a pipeline for processing Molecular Cartography data from Resolve Bioscience (combinatorial FISH). It takes as input a table of FISH spot positions (x, y, z, gene), a corresponding DAPI image (TIFF format), and optionally a membrane staining image (TIFF format). nf-core/molkart performs end-to-end processing of the data, including image processing, QC filtering of spots, cell segmentation, and spot-to-cell assignment, and reports quality metrics such as the spot assignment rate, average spots per cell, and segmentation mask size ranges.
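
A minimal example samplesheet, sketched from the input schema updated in this commit (`assets/schema_input.json`); the sample name and file paths below are placeholders:

```csv
sample,nuclear_image,spot_table
sample1,/path/to/sample1_DAPI.tiff,/path/to/sample1_spots.txt
```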

<!-- TODO nf-core: Include a figure that guides the user through the major workflow steps. Many nf-core
workflows use the "tube map" design for that. See https://nf-co.re/docs/contributing/design_guidelines#examples for examples. -->
26 changes: 9 additions & 17 deletions assets/schema_input.json
@@ -9,28 +9,20 @@
"properties": {
"sample": {
"type": "string",
"pattern": "^\\S+$",
"pattern": ".*$",
"errorMessage": "Sample name must be provided and cannot contain spaces"
},
"fastq_1": {
"nuclear_image": {
"type": "string",
"pattern": "^\\S+\\.f(ast)?q\\.gz$",
"errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
"pattern": ".*.tiff$",
"errorMessage": "Nuclear image must be provided, cannot contain spaces and must have extension '.tiff'"
},
"fastq_2": {
"errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'",
"anyOf": [
{
"type": "string",
"pattern": "^\\S+\\.f(ast)?q\\.gz$"
},
{
"type": "string",
"maxLength": 0
}
]
"spot_table": {
"type": "string",
"pattern": ".*.txt$",
"errorMessage": "Spot table must be provided, has to have shape x,y,z,gene with sep = '\t', cannot contain spaces and must have extension '.txt'"
}
},
"required": ["sample", "fastq_1"]
"required": ["sample", "nuclear_image", "spot_table"]
}
}
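
A note on the new `pattern` values: the dot before the extension is not escaped and the `^\S+` anchor was dropped, so `.*.txt$` also accepts names like `spots_txt` or paths containing spaces, even though the error messages still say spaces are not allowed. A standalone sketch (plain Python `re`, not part of this commit) comparing the loose pattern with a stricter alternative such as `^\S+\.txt$`:

```python
import re

loose = r".*.txt$"        # pattern from the schema above; "." matches any character
strict = r"^\S+\.txt$"    # hypothetical stricter pattern: no whitespace, literal ".txt"

for name in ["spots.txt", "spots_txt", "my spots.txt"]:
    # loose accepts all three names; strict accepts only "spots.txt"
    print(f"{name!r}: loose={bool(re.search(loose, name))}, strict={bool(re.search(strict, name))}")
```
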
181 changes: 181 additions & 0 deletions bin/apply_clahe.dask.py
@@ -0,0 +1,181 @@
#!/usr/bin/env python
from __future__ import print_function, division
import argparse
from argparse import ArgumentParser as AP
from os.path import abspath
import sys
import time
import numpy as np
import tifffile
import zarr
import skimage.transform
from skimage.exposure import equalize_adapthist
from aicsimageio import aics_image as AI
# from ome_types import from_tiff, to_xml  # only needed for the commented-out metadata step below
# from memory_profiler import profile
# This API is apparently changing in skimage 1.0 but it's not clear to
# me what the replacement will be, if any. We'll explicitly import
# this so it will break loudly if someone tries this with skimage 1.0.
try:
    from skimage.util.dtype import _convert as dtype_convert
except ImportError:
    from skimage.util.dtype import convert as dtype_convert


def get_args():
    # Script description
    description = """Easy-to-use, large scale CLAHE"""

    # Add parser
    parser = AP(description=description, formatter_class=argparse.RawDescriptionHelpFormatter)

    # Sections
    inputs = parser.add_argument_group(title="Required Input", description="Path to required input file")
    inputs.add_argument("-r", "--raw", dest="raw", action="store", required=True, help="File path to input image.")
    inputs.add_argument("-o", "--output", dest="output", action="store", required=True, help="Path to output image.")
    inputs.add_argument("-c", "--channel", dest="channel", action="store", required=True, help="Channel on which CLAHE will be applied")
    inputs.add_argument("-l", "--cliplimit", dest="clip", action="store", required=True, help="Clip Limit for CLAHE")
    inputs.add_argument("--kernel", dest="kernel", action="store", required=False, default=None, help="Kernel size for CLAHE")
    inputs.add_argument("-g", "--nbins", dest="nbins", action="store", required=False, default=256, help="Number of bins for CLAHE")
    inputs.add_argument("-p", "--pixel-size", dest="pixel_size", action="store", required=True, help="Image pixel size")

    arg = parser.parse_args()

    # Standardize paths and coerce numeric arguments
    arg.raw = abspath(arg.raw)
    arg.channel = int(arg.channel)
    arg.clip = float(arg.clip)
    arg.pixel_size = float(arg.pixel_size)
    arg.nbins = int(arg.nbins)
    # --kernel defaults to None, so only cast when a value was provided
    arg.kernel = int(arg.kernel) if arg.kernel is not None else None

    return arg

def preduce(coords, img_in, img_out):
    print(img_in.dtype)
    (iy1, ix1), (iy2, ix2) = coords
    (oy1, ox1), (oy2, ox2) = np.array(coords) // 2
    tile = skimage.img_as_float32(img_in[iy1:iy2, ix1:ix2])
    tile = skimage.transform.downscale_local_mean(tile, (2, 2))
    tile = dtype_convert(tile, 'uint16')
    # tile = dtype_convert(tile, img_in.dtype)
    img_out[oy1:oy2, ox1:ox2] = tile

def format_shape(shape):
    return "%dx%d" % (shape[1], shape[0])

def subres_tiles(level, level_full_shapes, tile_shapes, outpath, scale):
    print(f"\n processing level {level}")
    assert level >= 1
    num_channels, h, w = level_full_shapes[level]
    tshape = tile_shapes[level] or (h, w)
    tiff = tifffile.TiffFile(outpath)
    zimg = zarr.open(tiff.aszarr(series=0, level=level-1, squeeze=False))
    for c in range(num_channels):
        sys.stdout.write(
            f"\r processing channel {c + 1}/{num_channels}"
        )
        sys.stdout.flush()
        th = tshape[0] * scale
        tw = tshape[1] * scale
        for y in range(0, zimg.shape[1], th):
            for x in range(0, zimg.shape[2], tw):
                a = zimg[c, y:y+th, x:x+tw, 0]
                a = skimage.transform.downscale_local_mean(
                    a, (scale, scale)
                )
                if np.issubdtype(zimg.dtype, np.integer):
                    a = np.around(a)
                a = a.astype('uint16')
                yield a

def main(args):
    print(f"Head directory = {args.raw}")
    print(f"Channel = {args.channel}, ClipLimit = {args.clip}, nbins = {args.nbins}, kernel_size = {args.kernel}, pixel_size = {args.pixel_size}")

    # clahe = cv2.createCLAHE(clipLimit = int(args.clip), tileGridSize=tuple(map(int, args.grid)))

    img_raw = AI.AICSImage(args.raw)
    img_dask = img_raw.get_image_dask_data("CYX").astype('uint16')
    adapted = img_dask[args.channel].compute() / 65535
    adapted = (equalize_adapthist(adapted, kernel_size=args.kernel, clip_limit=args.clip, nbins=args.nbins) * 65535).astype('uint16')
    img_dask[args.channel] = adapted

    # construct levels
    tile_size = 1024
    scale = 2

    pixel_size = args.pixel_size
    dtype = img_dask.dtype
    base_shape = img_dask[0].shape
    num_channels = img_dask.shape[0]
    num_levels = (np.ceil(np.log2(max(base_shape) / tile_size)) + 1).astype(int)
    factors = 2 ** np.arange(num_levels)
    shapes = (np.ceil(np.array(base_shape) / factors[:, None])).astype(int)

    print("Pyramid level sizes: ")
    for i, shape in enumerate(shapes):
        print(f" level {i+1}: {format_shape(shape)}", end="")
        if i == 0:
            print("(original size)", end="")
        print()
    print()
    print(shapes)

    level_full_shapes = []
    for shape in shapes:
        level_full_shapes.append((num_channels, shape[0], shape[1]))
    level_shapes = shapes
    tip_level = np.argmax(np.all(level_shapes < tile_size, axis=1))
    tile_shapes = [
        (tile_size, tile_size) if i <= tip_level else None
        for i in range(len(level_shapes))
    ]

    # write pyramid
    with tifffile.TiffWriter(args.output, ome=True, bigtiff=True) as tiff:
        tiff.write(
            data=img_dask,
            shape=level_full_shapes[0],
            subifds=int(num_levels - 1),
            dtype=dtype,
            resolution=(10000 / pixel_size, 10000 / pixel_size, "centimeter"),
            tile=tile_shapes[0],
        )
        for level, (shape, tile_shape) in enumerate(
            zip(level_full_shapes[1:], tile_shapes[1:]), 1
        ):
            tiff.write(
                data=subres_tiles(level, level_full_shapes, tile_shapes, args.output, scale),
                shape=shape,
                subfiletype=1,
                dtype=dtype,
                tile=tile_shape,
            )

    # note about metadata: the channels, planes etc were adjusted not to include the removed channels, however
    # the channel ids have stayed the same as before removal. E.g if channels 1 and 2 are removed,
    # the channel ids in the metadata will skip indices 1 and 2 (channel_id:0, channel_id:3, channel_id:4 ...)
    # tifffile.tiffcomment(args.output, to_xml(metadata))
    print()


if __name__ == '__main__':
    # Read in arguments
    args = get_args()

    # Run script
    st = time.time()
    main(args)
    rt = time.time() - st
    print(f"Script finished in {rt // 60:.0f}m {rt % 60:.0f}s")
84 changes: 84 additions & 0 deletions bin/collect_QC.py
@@ -0,0 +1,84 @@
#!/usr/bin/env python

#### This script takes regionprops_table output from mcquant and the raw spot tables from Resolve Bioscience as input
#### and calculates QC metrics for segmentation masks and spot assignments

import argparse
import pandas as pd

def summarize_spots(spot_table):
    ## Calculate number of spots per gene
    tx_per_gene = spot_table.groupby('gene').count().reset_index()

    ## Calculate the total number of spots in spot_table
    total_spots = spot_table.shape[0]

    return (tx_per_gene, total_spots)

def summarize_segmasks(mcquant, spots_summary):
    ## Calculate the total number of cells (rows) in mcquant
    total_cells = mcquant.shape[0]

    ## Calculate the average segmentation area from column Area in mcquant
    avg_area = mcquant['Area'].mean()

    ## Calculate the % of spots assigned
    ## Subset mcquant for all columns with _intensity_sum in the column name and sum the column values
    spot_assign = mcquant.filter(regex='_intensity_sum').sum(axis=1)
    spot_assign_total = int(sum(spot_assign))
    ## Guard against empty segmentation masks (no cells)
    spot_assign_per_cell = spot_assign_total / total_cells if total_cells else 0
    spot_assign_percent = spot_assign_total / spots_summary[1] * 100

    return (total_cells, avg_area, spot_assign_per_cell, spot_assign_total, spot_assign_percent)

if __name__ == "__main__":
# Write an argparse with input options mcquant_in, spots and output options outdir, sample_id
parser = argparse.ArgumentParser()
parser.add_argument(
"-i",
"--mcquant",
help="mcquant regionprops_table."
)
parser.add_argument(
"-s",
"--spots",
help="Resolve biosciences spot table."
)
parser.add_argument(
"-o",
"--outdir",
help="Output directory."
)

parser.add_argument(
"-d",
"--sample_id",
help="Sample ID."
)

parser.add_argument(
"-g",
"--segmentation_method",
help="Segmentation method used."
)

args = parser.parse_args()

## Read in mcquant table
mcquant = pd.read_csv(args.mcquant)

## Read in spot table
spots = pd.read_table(args.spots, sep='\t', names=['x', 'y', 'z', 'gene'])
spots = spots[~spots.gene.str.contains("Duplicated")]

## Summarize spots table
summary_spots = summarize_spots(spots)
summary_segmentation = summarize_segmasks(mcquant,summary_spots)

## Create pandas data frame with one row per parameter and write each value in summary_segmentation to a new row in the data frame
summary_df = pd.DataFrame(columns=['sample_id','segmentation_method','total_cells','avg_area','total_spots','spot_assign_per_cell','spot_assign_total','spot_assign_percent',])
summary_df.loc[0] = [args.sample_id + "_" + args.segmentation_method,args.segmentation_method,summary_segmentation[0],summary_segmentation[1],summary_spots[1],summary_segmentation[2],summary_segmentation[3],summary_segmentation[4]]

# Write summary_df to a csv file
summary_df.to_csv(f"{args.outdir}/{args.sample_id}.{args.segmentation_method}.spot_QC.csv", header = True, index=False)
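
As a rough illustration of the inputs this script expects and the metrics it reports, here is a toy example built with pandas; the column names follow the code above, but the tables and values are made up:

```python
import pandas as pd

# Toy mcquant regionprops table: one row per cell, one *_intensity_sum column per gene
mcquant = pd.DataFrame({
    "CellID": [1, 2, 3],
    "Area": [250.0, 310.0, 180.0],
    "GeneA_intensity_sum": [12, 0, 3],
    "GeneB_intensity_sum": [5, 8, 0],
})

# Toy Resolve spot table with columns x, y, z, gene (tab-separated in the real input)
spots = pd.DataFrame({
    "x": range(40), "y": range(40), "z": [0] * 40,
    "gene": ["GeneA"] * 25 + ["GeneB"] * 15,
})

total_spots = spots.shape[0]                                              # 40
total_cells = mcquant.shape[0]                                            # 3
assigned = int(mcquant.filter(regex="_intensity_sum").sum(axis=1).sum())  # 28

print(assigned / total_cells)        # average spots per cell: ~9.33
print(assigned / total_spots * 100)  # spot assignment rate: 70.0 %
print(mcquant["Area"].mean())        # average mask area: ~246.7
```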
