Skip to content

Commit

Permalink
[FEAT] Add new Snakefile and rules for analysis no_hashing
Browse files Browse the repository at this point in the history
  • Loading branch information
Anne Bertolini committed Aug 30, 2023
1 parent c26467b commit 9e451f5
Show file tree
Hide file tree
Showing 3 changed files with 255 additions and 0 deletions.
112 changes: 112 additions & 0 deletions workflow/Snakefile_no_hashing.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# Define minimum Snakemake Version
from snakemake.utils import min_version

min_version("6.12.1")


# Include Config file
configfile: "config/config.yaml"


# Include report functionality
report: "../report/workflow.rst"


# This file includes common functions used in the pipeline
include: "rules/misc_snake.smk"


## Include scampi as a module
module scampi:
skip_validation:
True
snakefile:
github(
"ETH-NEXUS/scAmpi_single_cell_RNA",
path="workflow/snakefile_basic.smk",
tag="v2.0.7",
)
config:
config["scampi"]


## Include the preprocessing rules
include: "rules/gex_cellranger_no_hashing.smk"
include: "rules/adt_cellranger_no_hashing.smk"
## Include scampi
include: "rules/scampi_module.smk"
## Include citseq rules
include: "rules/adt_analyse_citeseq.smk"


# This rule defines which files should be created
localrules:
gExcite,


rule gExcite:
input:
# Final files from preprocessing
expand(
"results/cellranger_gex/{sample}.matrix.mtx",
sample=getSimpleSampleNames(),
),
expand(
"results/cellranger_gex/{sample}.barcodes.tsv",
sample=getSimpleSampleNames(),
),
expand(
"results/cellranger_gex/{sample}.features.tsv",
sample=getSimpleSampleNames(),
),
expand(
"results/cellranger_adt/{sample}.matrix.mtx",
sample=getSimpleSampleNames(),
),
# List of final files from scampi
expand(
"results/counts_raw/{sample}.h5",
sample=getSimpleSampleNames(),
),
expand(
"results/counts_filtered/{sample}.doublet_barcodes.txt",
sample=getSimpleSampleNames(),
),
expand(
"results/counts_raw/{sample}.raw.histogram_library_sizes.png",
sample=getSimpleSampleNames(),
),
expand(
"results/counts_corrected/{sample}.corrected.RDS",
sample=getSimpleSampleNames(),
),
expand(
"results/clustering/{sample}.clusters_phenograph.csv",
sample=getSimpleSampleNames(),
),
expand(
"results/gene_exp/{sample}.gene_expression_per_cluster.tsv",
sample=getSimpleSampleNames(),
),
expand(
"results/plotting/{sample}.celltype_barplot.png",
sample=getSimpleSampleNames(),
),
expand(
"results/gsva/{sample}.gsetscore_hm.png",
sample=getSimpleSampleNames(),
),
# List of final files from citeseq analysis
expand(
"results/citeseq_analysis/{sample}/{sample}.GEX_cellrangerADT_SCE.RDS",
sample=getSimpleSampleNames(),
),
output:
"results/complete_gExcite.txt",
resources:
mem_mb=1000,
time=1,
benchmark:
"results/complete_gExcite.benchmark"
shell:
"date > {output} "
82 changes: 82 additions & 0 deletions workflow/rules/adt_cellranger_no_hashing.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Rules required for the basic cellranger run of sc ADT data

import os

WORKDIR = os.getcwd()


# Preprocessing to generate library file for ADT cellranger from the provided input parameters
# Library file has a fixed format: fastqs,sample,library_type (with header)
rule create_library_file_adt:
input:
fastqs_dir=WORKDIR
+ "/"
+ config["inputOutput"]["input_fastqs_adt"]
+ "{sample}/",
output:
library_file=WORKDIR + "/results/cellranger_adt/{sample}.adt_library.txt",
threads: config["computingResources"]["threads"]["low"]
resources:
mem_mb=config["computingResources"]["mem_mb"]["low"],
runtime=config["computingResources"]["runtime"]["low"],
log:
"logs/create_library_file_adt/{sample}.log",
benchmark:
"results/cellranger_adt/benchmark/{sample}.generate_library_adt.benchmark"
shell:
r"""
echo -e "fastqs,sample,library_type\n{input.fastqs_dir},{wildcards.sample},Antibody Capture" > {output.library_file}
"""


# Cellranger call to process the raw ADT samples
rule cellranger_count_adt:
input:
library=WORKDIR + "/results/cellranger_adt/{sample}.adt_library.txt",
reference=config["resources"]["reference_transcriptome"],
features_ref=getFeatRefFileSimple,
output:
features_file="results/cellranger_adt/{sample}.features.tsv",
matrix_file="results/cellranger_adt/{sample}.matrix.mtx",
barcodes_file="results/cellranger_adt/{sample}.barcodes.tsv",
web_file=report(
"results/cellranger_adt/{sample}.web_summary.html",
caption="../report/cellranger_adt.rst",
category="preprocessing QC",
),
metrics_file="results/cellranger_adt/{sample}.metrics_summary.csv",
params:
cr_out="results/cellranger_adt/",
variousParams=config["tools"]["cellranger_count_adt"]["variousParams"],
targetCells=getTargetCellsCellranger_simple,
sample="{sample}",
threads: config["computingResources"]["threads"]["high"]
log:
"logs/cellranger_count_adt/{sample}.log",
resources:
mem_mb=config["computingResources"]["mem_mb"]["high"],
runtime=config["computingResources"]["runtime"]["high"],
benchmark:
"results/cellranger_adt/benchmark/{sample}.cellranger_count_adt.benchmark"
# NOTE: cellranger count function cannot specify the output directory, the output it the path you call it from.
# Therefore, a subshell is used here.
shell:
"(cd {params.cr_out} ; "
"{config[tools][cellranger_count_adt][call]} count "
"--id={params.sample} "
"--transcriptome={input.reference} "
"--libraries={input.library} "
"--feature-ref={input.features_ref} "
"--nosecondary "
"--localcores={threads} "
"{params.variousParams} "
"{params.targetCells}) "
"&> {log} "
"&& gunzip -c {params.cr_out}{params.sample}/outs/filtered_feature_bc_matrix/features.tsv.gz > {params.cr_out}{params.sample}/outs/filtered_feature_bc_matrix/features.tsv ; "
"gunzip -c {params.cr_out}{params.sample}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz > {params.cr_out}{params.sample}/outs/filtered_feature_bc_matrix/barcodes.tsv ; "
"gunzip -c {params.cr_out}{params.sample}/outs/filtered_feature_bc_matrix/matrix.mtx.gz > {params.cr_out}{params.sample}/outs/filtered_feature_bc_matrix/matrix.mtx "
"&& ln -rs '{params.cr_out}{params.sample}/outs/filtered_feature_bc_matrix/features.tsv' '{output.features_file}' ; "
"ln -rs '{params.cr_out}{params.sample}/outs/filtered_feature_bc_matrix/matrix.mtx' '{output.matrix_file}' ; "
"ln -rs '{params.cr_out}{params.sample}/outs/filtered_feature_bc_matrix/barcodes.tsv' '{output.barcodes_file}' ; "
"ln -rs '{params.cr_out}{params.sample}/outs/web_summary.html' '{output.web_file}' ; "
"ln -rs '{params.cr_out}{params.sample}/outs/metrics_summary.csv' '{output.metrics_file}' "
61 changes: 61 additions & 0 deletions workflow/rules/gex_cellranger_no_hashing.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import os

WORKDIR = os.getcwd()


# Cellranger call to process the raw GEX samples
# input: fastq directory from config file and reference transcriptome read from config file
# NOTE: the fastq directory as set in the config is interpreted to be relative to the gExcite working directory
# output: cellranger files
rule cellranger_count_gex:
input:
fastqs_dir=WORKDIR
+ "/"
+ config["inputOutput"]["input_fastqs_gex"]
+ "{sample}/",
reference=config["resources"]["reference_transcriptome"],
output:
features_file="results/cellranger_gex/{sample}.features.tsv",
matrix_file="results/cellranger_gex/{sample}.matrix.mtx",
barcodes_file="results/cellranger_gex/{sample}.barcodes.tsv",
web_file=report(
"results/cellranger_gex/{sample}.web_summary.html",
caption="../report/cellranger.rst",
category="preprocessing QC",
),
metrics_file="results/cellranger_gex/{sample}.metrics_summary.csv",
params:
cr_out="results/cellranger_gex/",
variousParams=config["tools"]["cellranger_count_gex"]["variousParams"],
targetCells=getTargetCellsCellranger_simple,
mySample="{sample}",
threads: config["computingResources"]["threads"]["high"]
log:
"logs/cellranger_count_gex/{sample}.log",
resources:
mem_mb=config["computingResources"]["mem_mb"]["high"],
runtime=config["computingResources"]["runtime"]["high"],
benchmark:
"results/cellranger_gex/benchmark/{sample}.cellranger_count_gex.benchmark"
# NOTE: cellranger count function cannot specify the output directory, the output is the path you call it from.
# Therefore, a subshell is used here.
# Also, unzip and symlink output files in preparation for downstream steps
shell:
"(cd {params.cr_out}; "
"{config[tools][cellranger_count_gex][call]} count "
"--id={params.mySample} "
"--sample={params.mySample} "
"--transcriptome={input.reference} "
"--fastqs={input.fastqs_dir} "
"--nosecondary "
"--localcores={threads} "
"{params.variousParams}) "
"&> {log} ; "
"gunzip {params.cr_out}{params.mySample}/outs/filtered_feature_bc_matrix/features.tsv.gz ; "
"gunzip {params.cr_out}{params.mySample}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz ; "
"gunzip {params.cr_out}{params.mySample}/outs/filtered_feature_bc_matrix/matrix.mtx.gz ; "
"ln -frs '{params.cr_out}{params.mySample}/outs/filtered_feature_bc_matrix/features.tsv' '{output.features_file}' ; "
"ln -frs '{params.cr_out}{params.mySample}/outs/filtered_feature_bc_matrix/matrix.mtx' '{output.matrix_file}' ; "
"ln -frs '{params.cr_out}{params.mySample}/outs/filtered_feature_bc_matrix/barcodes.tsv' '{output.barcodes_file}' ; "
"ln -frs '{params.cr_out}{params.mySample}/outs/web_summary.html' '{output.web_file}' ; "
"ln -frs '{params.cr_out}{params.mySample}/outs/metrics_summary.csv' '{output.metrics_file}' "

0 comments on commit 9e451f5

Please sign in to comment.