First minimal working version of pipeline #2

Merged
merged 9 commits on Sep 27, 2023
26 changes: 9 additions & 17 deletions assets/schema_input.json
Expand Up @@ -9,28 +9,20 @@
"properties": {
"sample": {
"type": "string",
"pattern": "^\\S+$",
"pattern": ".*$",
"errorMessage": "Sample name must be provided and cannot contain spaces"
},
"fastq_1": {
"nuclear_image": {
"type": "string",
"pattern": "^\\S+\\.f(ast)?q\\.gz$",
"errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
"pattern": ".*.tiff$",
"errorMessage": "Nuclear image must be provided, cannot contain spaces and must have extension '.tiff'"
},
"fastq_2": {
"errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'",
"anyOf": [
{
"type": "string",
"pattern": "^\\S+\\.f(ast)?q\\.gz$"
},
{
"type": "string",
"maxLength": 0
}
]
"spot_table": {
"type": "string",
"pattern": ".*.txt$",
"errorMessage": "Spot table must be provided, has to have shape x,y,z,gene with sep = '\t', cannot contain spaces and must have extension '.txt'"
}
},
"required": ["sample", "fastq_1"]
"required": ["sample", "nuclear_image", "spot_table"]
}
}
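
For reference, a samplesheet row that satisfies the updated schema might look like the following (a sketch: the header names follow the schema's properties, the paths are placeholders):

sample,nuclear_image,spot_table
sample1,/data/sample1_DAPI.tiff,/data/sample1_spots.txt

The spot table itself is expected to be tab-separated with columns x, y, z, gene.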
181 changes: 181 additions & 0 deletions bin/apply_clahe.dask.py
@@ -0,0 +1,181 @@
#!/usr/bin/env python
from __future__ import print_function, division
import argparse
from argparse import ArgumentParser as AP
from os.path import abspath
import sys
import time
import numpy as np
import tifffile
import zarr
import skimage
import skimage.transform
from skimage.exposure import equalize_adapthist
from aicsimageio import aics_image as AI
#from memory_profiler import profile
# This API is apparently changing in skimage 1.0 but it's not clear to
# me what the replacement will be, if any. We'll explicitly import
# this so it will break loudly if someone tries this with skimage 1.0.
try:
from skimage.util.dtype import _convert as dtype_convert
except ImportError:
from skimage.util.dtype import convert as dtype_convert


def get_args():
# Script description
description="""Easy-to-use, large scale CLAHE"""

# Add parser
parser = AP(description=description, formatter_class=argparse.RawDescriptionHelpFormatter)

# Sections
inputs = parser.add_argument_group(title="Required Input", description="Path to required input file")
inputs.add_argument("-r", "--raw", dest="raw", action="store", required=True, help="File path to input image.")
inputs.add_argument("-o", "--output", dest="output", action="store", required=True, help="Path to output image.")
inputs.add_argument("-c", "--channel", dest="channel", action="store", required=True, help="Channel on which CLAHE will be applied")
inputs.add_argument("-l", "--cliplimit", dest="clip", action="store", required=True, help="Clip Limit for CLAHE")
inputs.add_argument("--kernel", dest="kernel", action="store", required=False, default=None, help="Kernel size for CLAHE")
inputs.add_argument("-g", "--nbins", dest="nbins", action="store", required=False, default=256, help="Number of bins for CLAHE")
inputs.add_argument("-p", "--pixel-size", dest="pixel_size", action="store", required=True, help="Image pixel size")

arg = parser.parse_args()

# Standardize paths
arg.raw = abspath(arg.raw)
arg.channel = int(arg.channel)
arg.clip = float(arg.clip)
arg.pixel_size = float(arg.pixel_size)
arg.nbins = int(arg.nbins)
    # --kernel is optional; only cast when provided (the default None lets
    # equalize_adapthist derive the kernel size from the image shape)
    if arg.kernel is not None:
        arg.kernel = int(arg.kernel)

return arg

def preduce(coords, img_in, img_out):
print(img_in.dtype)
(iy1, ix1), (iy2, ix2) = coords
(oy1, ox1), (oy2, ox2) = np.array(coords) // 2
tile = skimage.img_as_float32(img_in[iy1:iy2, ix1:ix2])
tile = skimage.transform.downscale_local_mean(tile, (2, 2))
tile = dtype_convert(tile, 'uint16')
#tile = dtype_convert(tile, img_in.dtype)
img_out[oy1:oy2, ox1:ox2] = tile

def format_shape(shape):
return "%dx%d" % (shape[1], shape[0])

def subres_tiles(level, level_full_shapes, tile_shapes, outpath, scale):
print(f"\n processing level {level}")
assert level >= 1
num_channels, h, w = level_full_shapes[level]
tshape = tile_shapes[level] or (h, w)
tiff = tifffile.TiffFile(outpath)
zimg = zarr.open(tiff.aszarr(series=0, level=level-1, squeeze=False))
for c in range(num_channels):
sys.stdout.write(
f"\r processing channel {c + 1}/{num_channels}"
)
sys.stdout.flush()
th = tshape[0] * scale
tw = tshape[1] * scale
for y in range(0, zimg.shape[1], th):
for x in range(0, zimg.shape[2], tw):
a = zimg[c, y:y+th, x:x+tw, 0]
a = skimage.transform.downscale_local_mean(
a, (scale, scale)
)
if np.issubdtype(zimg.dtype, np.integer):
a = np.around(a)
a = a.astype('uint16')
yield a

def main(args):
print(f"Head directory = {args.raw}")
print(f"Channel = {args.channel}, ClipLimit = {args.clip}, nbins = {args.nbins}, kernel_size = {args.kernel}, pixel_size = {args.pixel_size}")

#clahe = cv2.createCLAHE(clipLimit = int(args.clip), tileGridSize=tuple(map(int, args.grid)))

img_raw = AI.AICSImage(args.raw)
img_dask = img_raw.get_image_dask_data("CYX").astype('uint16')
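    # equalize_adapthist expects floats in [0, 1], so the uint16 channel is
    # scaled down by 65535, CLAHE is applied, then the result is scaled back
    # up and rounded to uint16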
adapted = img_dask[args.channel].compute()/65535
adapted = (equalize_adapthist(adapted, kernel_size=args.kernel, clip_limit=args.clip, nbins=args.nbins)*65535).astype('uint16')
img_dask[args.channel] = adapted

# construct levels
tile_size = 1024
scale = 2

pixel_size = args.pixel_size
dtype = img_dask.dtype
base_shape = img_dask[0].shape
num_channels = img_dask.shape[0]
num_levels = (np.ceil(np.log2(max(base_shape) / tile_size)) + 1).astype(int)
factors = 2 ** np.arange(num_levels)
shapes = (np.ceil(np.array(base_shape) / factors[:,None])).astype(int)
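    # Worked example (illustrative numbers): for a 30000 x 30000 base image and
    # tile_size = 1024, max(base_shape) / tile_size ~= 29.3, so
    # num_levels = ceil(log2(29.3)) + 1 = 6, factors = [1, 2, 4, 8, 16, 32], and
    # the per-side level sizes are 30000, 15000, 7500, 3750, 1875 and 938.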

print("Pyramid level sizes: ")
for i, shape in enumerate(shapes):
print(f" level {i+1}: {format_shape(shape)}", end="")
if i == 0:
print("(original size)", end="")
print()
print()
print(shapes)

level_full_shapes = []
for shape in shapes:
level_full_shapes.append((num_channels, shape[0], shape[1]))
level_shapes = shapes
tip_level = np.argmax(np.all(level_shapes < tile_size, axis=1))
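    # tip_level is the first pyramid level that fits entirely within a single
    # tile; that level and all larger ones are written tiled, smaller ones untiled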
tile_shapes = [
(tile_size, tile_size) if i <= tip_level else None
for i in range(len(level_shapes))
]

# write pyramid
with tifffile.TiffWriter(args.output, ome=True, bigtiff=True) as tiff:
tiff.write(
data = img_dask,
shape = level_full_shapes[0],
subifds=int(num_levels-1),
dtype=dtype,
resolution=(10000 / pixel_size, 10000 / pixel_size, "centimeter"),
tile=tile_shapes[0]
)
for level, (shape, tile_shape) in enumerate(
zip(level_full_shapes[1:], tile_shapes[1:]), 1
):
tiff.write(
data = subres_tiles(level, level_full_shapes, tile_shapes, args.output, scale),
shape=shape,
subfiletype=1,
dtype=dtype,
tile=tile_shape
)

# note about metadata: the channels, planes etc were adjusted not to include the removed channels, however
# the channel ids have stayed the same as before removal. E.g if channels 1 and 2 are removed,
# the channel ids in the metadata will skip indices 1 and 2 (channel_id:0, channel_id:3, channel_id:4 ...)
#tifffile.tiffcomment(args.output, to_xml(metadata))
print()


if __name__ == '__main__':
# Read in arguments
args = get_args()

# Run script
st = time.time()
main(args)
rt = time.time() - st
print(f"Script finished in {rt // 60:.0f}m {rt % 60:.0f}s")
84 changes: 84 additions & 0 deletions bin/collect_QC.py
@@ -0,0 +1,84 @@
#!/usr/bin/env python

#### This script takes regionprops_table output from mcquant and the raw spot tables from Resolve Bioscience as input
#### and calculates QC metrics for masks and spot assignments

import argparse
import pandas as pd

def summarize_spots(spot_table):
## Calculate number of spots per gene
tx_per_gene = spot_table.groupby('gene').count().reset_index()

## Calculate the total number of spots in spot_table
total_spots = spot_table.shape[0]

return(tx_per_gene,total_spots)

def summarize_segmasks(mcquant,spots_summary):
## Calculate the total number of cells (rows) in mcquant
total_cells = mcquant.shape[0]

## Calculate the average segmentation area from column Area in mcquant
avg_area = mcquant['Area'].mean()

## Calculate the % of spots assigned
## Subset mcquant for all columns with _intensity_sum in the column name and sum the column values
spot_assign = mcquant.filter(regex='_intensity_sum').sum(axis=1)
spot_assign_total = int(sum(spot_assign))
    spot_assign_per_cell = spot_assign_total / total_cells if total_cells else 0
spot_assign_percent = spot_assign_total / spots_summary[1] * 100

return(total_cells,avg_area,spot_assign_per_cell,spot_assign_total,spot_assign_percent)

if __name__ == "__main__":
# Write an argparse with input options mcquant_in, spots and output options outdir, sample_id
parser = argparse.ArgumentParser()
parser.add_argument(
"-i",
"--mcquant",
help="mcquant regionprops_table."
)
parser.add_argument(
"-s",
"--spots",
help="Resolve biosciences spot table."
)
parser.add_argument(
"-o",
"--outdir",
help="Output directory."
)

parser.add_argument(
"-d",
"--sample_id",
help="Sample ID."
)

parser.add_argument(
"-g",
"--segmentation_method",
help="Segmentation method used."
)

args = parser.parse_args()

## Read in mcquant table
mcquant = pd.read_csv(args.mcquant)

## Read in spot table
spots = pd.read_table(args.spots, sep='\t', names=['x', 'y', 'z', 'gene'])
spots = spots[~spots.gene.str.contains("Duplicated")]

## Summarize spots table
summary_spots = summarize_spots(spots)
summary_segmentation = summarize_segmasks(mcquant,summary_spots)

## Create pandas data frame with one row per parameter and write each value in summary_segmentation to a new row in the data frame
summary_df = pd.DataFrame(columns=['sample_id','segmentation_method','total_cells','avg_area','total_spots','spot_assign_per_cell','spot_assign_total','spot_assign_percent',])
summary_df.loc[0] = [args.sample_id + "_" + args.segmentation_method,args.segmentation_method,summary_segmentation[0],summary_segmentation[1],summary_spots[1],summary_segmentation[2],summary_segmentation[3],summary_segmentation[4]]

# Write summary_df to a csv file
summary_df.to_csv(f"{args.outdir}/{args.sample_id}.{args.segmentation_method}.spot_QC.csv", header = True, index=False)
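
An illustrative call (placeholder file names; the flags come from the argparse block above):

python bin/collect_QC.py \
    --mcquant sample1.regionprops.csv \
    --spots sample1.spots.txt \
    --outdir results/qc \
    --sample_id sample1 \
    --segmentation_method cellpose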
75 changes: 75 additions & 0 deletions bin/project_spots.dask.py
@@ -0,0 +1,75 @@
#!/usr/bin/env python

#### This script takes a table of Molecular Cartography spots as input and projects them
#### onto the shape of a reference image. The output is a stack of images, one channel
#### per gene, with duplicate spots accumulated into the pixel values.

## Import packages
import argparse
import pandas as pd
import numpy as np
import dask.array as da
import dask.dataframe as dd
import tifffile
from rich.progress import track
from aicsimageio.writers import OmeTiffWriter

# Project a table of spots with x,y coordinates onto a 2D plane with the shape of the
# reference image; duplicate spots at the same coordinates add up, increasing that
# pixel's value in the output image
def project_spots(spot_table,img):
    # Initialize an empty image with the same shape as the reference image
    # (note: int8 caps accumulated counts at 127 per pixel)
    img = np.zeros_like(img, dtype='int8')
# Iterate through each spot in the table
for spot in spot_table.itertuples():
# Add the corresponding spot count to the pixel value at the spot's x,y coordinates
img[spot.y, spot.x] += spot.counts
return img
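
# For example: a spot detected at the same (y, x) in four z-planes arrives here as a
# single row with counts = 4, so the corresponding output pixel is set to 4.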

if __name__ == "__main__":
# Add a python argument parser with options for input, output and image size in x and y
parser = argparse.ArgumentParser()
parser.add_argument(
"-i",
"--input",
help="Spot table to project."
)
parser.add_argument(
"-s",
"--sample_id",
help="Sample ID."
)
parser.add_argument(
"-d",
"--img_dims",
dest="img_dims",
help="Corresponding image to get dimensions from."
)

args = parser.parse_args()

#spots = pd.read_csv(args.input)
spots = dd.read_table(args.input, sep='\t', names=['x', 'y', 'z', 'gene']).compute()
img = tifffile.imread(args.img_dims)

spots = spots[["y","x", "gene"]]

## Filter any genes marked with Duplicated
spots = spots[~spots.gene.str.contains("Duplicated")]

    # Collapse the z-axis: count duplicate (y, x, gene) rows so each spot location
    # carries its total count across z-planes
    spots_zsum = spots.value_counts().to_frame('counts').reset_index()

    # Project each gene onto its own 2D plane, tracking progress with rich
    spots_2d_list = [project_spots(spots_zsum[spots_zsum.gene == gene], img) for gene in track(spots_zsum.gene.unique(), description='[green]Projecting spots...')]

# Stack images on the c-axis
spot_2d_stack = da.stack(spots_2d_list, axis=0)

## Write a csv file containing the channel names
channel_names = spots_zsum.gene.unique().tolist()
pd.DataFrame(channel_names).to_csv(args.sample_id + ".channel_names.csv", index=False, header=False)

#tifffile.imwrite(args.output, spot_2d_stack, metadata={'axes': 'CYX'})
OmeTiffWriter.save(spot_2d_stack,
args.sample_id + ".tiff",
dim_order = "CYX")
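
An illustrative invocation (placeholder file names; flags as defined above):

python bin/project_spots.dask.py \
    --input sample1.spots.txt \
    --sample_id sample1 \
    --img_dims sample1.ome.tiff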