[FEAT] Add new Snakefile and rules for analysis no_hashing

ETH-NEXUS · Aug 30, 2023 · 9e451f5 · 9e451f5
1 parent c26467b
commit 9e451f5
Show file tree

Hide file tree

Showing 3 changed files with 255 additions and 0 deletions.
diff --git a/workflow/Snakefile_no_hashing.smk b/workflow/Snakefile_no_hashing.smk
@@ -0,0 +1,112 @@
+# Define minimum Snakemake Version
+from snakemake.utils import min_version
+
+min_version("6.12.1")
+
+
+# Include Config file
+configfile: "config/config.yaml"
+
+
+# Include report functionality
+report: "../report/workflow.rst"
+
+
+# This file includes common functions used in the pipeline
+include: "rules/misc_snake.smk"
+
+
+## Include scampi as a module
+module scampi:
+    skip_validation:
+        True
+    snakefile:
+        github(
+            "ETH-NEXUS/scAmpi_single_cell_RNA",
+            path="workflow/snakefile_basic.smk",
+            tag="v2.0.7",
+        )
+    config:
+        config["scampi"]
+
+
+## Include the preprocessing rules
+include: "rules/gex_cellranger_no_hashing.smk"
+include: "rules/adt_cellranger_no_hashing.smk"
+## Include scampi
+include: "rules/scampi_module.smk"
+## Include citseq rules
+include: "rules/adt_analyse_citeseq.smk"
+
+
+# This rule defines which files should be created
+localrules:
+    gExcite,
+
+
+rule gExcite:
+    input:
+        # Final files from preprocessing
+        expand(
+            "results/cellranger_gex/{sample}.matrix.mtx",
+            sample=getSimpleSampleNames(),
+        ),
+        expand(
+            "results/cellranger_gex/{sample}.barcodes.tsv",
+            sample=getSimpleSampleNames(),
+        ),
+        expand(
+            "results/cellranger_gex/{sample}.features.tsv",
+            sample=getSimpleSampleNames(),
+        ),
+        expand(
+            "results/cellranger_adt/{sample}.matrix.mtx",
+            sample=getSimpleSampleNames(),
+        ),
+        # List of final files from scampi
+        expand(
+            "results/counts_raw/{sample}.h5",
+            sample=getSimpleSampleNames(),
+        ),
+        expand(
+            "results/counts_filtered/{sample}.doublet_barcodes.txt",
+            sample=getSimpleSampleNames(),
+        ),
+        expand(
+            "results/counts_raw/{sample}.raw.histogram_library_sizes.png",
+            sample=getSimpleSampleNames(),
+        ),
+        expand(
+            "results/counts_corrected/{sample}.corrected.RDS",
+            sample=getSimpleSampleNames(),
+        ),
+        expand(
+            "results/clustering/{sample}.clusters_phenograph.csv",
+            sample=getSimpleSampleNames(),
+        ),
+        expand(
+            "results/gene_exp/{sample}.gene_expression_per_cluster.tsv",
+            sample=getSimpleSampleNames(),
+        ),
+        expand(
+            "results/plotting/{sample}.celltype_barplot.png",
+            sample=getSimpleSampleNames(),
+        ),
+        expand(
+            "results/gsva/{sample}.gsetscore_hm.png",
+            sample=getSimpleSampleNames(),
+        ),
+        # List of final files from citeseq analysis
+        expand(
+            "results/citeseq_analysis/{sample}/{sample}.GEX_cellrangerADT_SCE.RDS",
+            sample=getSimpleSampleNames(),
+        ),
+    output:
+        "results/complete_gExcite.txt",
+    resources:
+        mem_mb=1000,
+        time=1,
+    benchmark:
+        "results/complete_gExcite.benchmark"
+    shell:
+        "date > {output} "
diff --git a/workflow/rules/adt_cellranger_no_hashing.smk b/workflow/rules/adt_cellranger_no_hashing.smk
@@ -0,0 +1,82 @@
+# Rules required for the basic cellranger run of sc ADT data
+
+import os
+
+WORKDIR = os.getcwd()
+
+
+# Preprocessing to generate library file for ADT cellranger from the provided input parameters
+# Library file has a fixed format: fastqs,sample,library_type (with header)
+rule create_library_file_adt:
+    input:
+        fastqs_dir=WORKDIR
+        + "/"
+        + config["inputOutput"]["input_fastqs_adt"]
+        + "{sample}/",
+    output:
+        library_file=WORKDIR + "/results/cellranger_adt/{sample}.adt_library.txt",
+    threads: config["computingResources"]["threads"]["low"]
+    resources:
+        mem_mb=config["computingResources"]["mem_mb"]["low"],
+        runtime=config["computingResources"]["runtime"]["low"],
+    log:
+        "logs/create_library_file_adt/{sample}.log",
+    benchmark:
+        "results/cellranger_adt/benchmark/{sample}.generate_library_adt.benchmark"
+    shell:
+        r"""
+        echo -e "fastqs,sample,library_type\n{input.fastqs_dir},{wildcards.sample},Antibody Capture" > {output.library_file}
+        """
+
+
+# Cellranger call to process the raw ADT samples
+rule cellranger_count_adt:
+    input:
+        library=WORKDIR + "/results/cellranger_adt/{sample}.adt_library.txt",
+        reference=config["resources"]["reference_transcriptome"],
+        features_ref=getFeatRefFileSimple,
+    output:
+        features_file="results/cellranger_adt/{sample}.features.tsv",
+        matrix_file="results/cellranger_adt/{sample}.matrix.mtx",
+        barcodes_file="results/cellranger_adt/{sample}.barcodes.tsv",
+        web_file=report(
+            "results/cellranger_adt/{sample}.web_summary.html",
+            caption="../report/cellranger_adt.rst",
+            category="preprocessing QC",
+        ),
+        metrics_file="results/cellranger_adt/{sample}.metrics_summary.csv",
+    params:
+        cr_out="results/cellranger_adt/",
+        variousParams=config["tools"]["cellranger_count_adt"]["variousParams"],
+        targetCells=getTargetCellsCellranger_simple,
+        sample="{sample}",
+    threads: config["computingResources"]["threads"]["high"]
+    log:
+        "logs/cellranger_count_adt/{sample}.log",
+    resources:
+        mem_mb=config["computingResources"]["mem_mb"]["high"],
+        runtime=config["computingResources"]["runtime"]["high"],
+    benchmark:
+        "results/cellranger_adt/benchmark/{sample}.cellranger_count_adt.benchmark"
+    # NOTE: cellranger count function cannot specify the output directory, the output it the path you call it from.
+    # Therefore, a subshell is used here.
+    shell:
+        "(cd {params.cr_out} ; "
+        "{config[tools][cellranger_count_adt][call]} count "
+        "--id={params.sample} "
+        "--transcriptome={input.reference} "
+        "--libraries={input.library} "
+        "--feature-ref={input.features_ref} "
+        "--nosecondary "
+        "--localcores={threads} "
+        "{params.variousParams} "
+        "{params.targetCells}) "
+        "&> {log} "
+        "&& gunzip -c {params.cr_out}{params.sample}/outs/filtered_feature_bc_matrix/features.tsv.gz > {params.cr_out}{params.sample}/outs/filtered_feature_bc_matrix/features.tsv ; "
+        "gunzip -c {params.cr_out}{params.sample}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz > {params.cr_out}{params.sample}/outs/filtered_feature_bc_matrix/barcodes.tsv ; "
+        "gunzip -c {params.cr_out}{params.sample}/outs/filtered_feature_bc_matrix/matrix.mtx.gz > {params.cr_out}{params.sample}/outs/filtered_feature_bc_matrix/matrix.mtx "
+        "&& ln -rs '{params.cr_out}{params.sample}/outs/filtered_feature_bc_matrix/features.tsv' '{output.features_file}' ; "
+        "ln -rs '{params.cr_out}{params.sample}/outs/filtered_feature_bc_matrix/matrix.mtx' '{output.matrix_file}' ; "
+        "ln -rs '{params.cr_out}{params.sample}/outs/filtered_feature_bc_matrix/barcodes.tsv' '{output.barcodes_file}' ; "
+        "ln -rs '{params.cr_out}{params.sample}/outs/web_summary.html' '{output.web_file}' ; "
+        "ln -rs '{params.cr_out}{params.sample}/outs/metrics_summary.csv' '{output.metrics_file}' "
diff --git a/workflow/rules/gex_cellranger_no_hashing.smk b/workflow/rules/gex_cellranger_no_hashing.smk
@@ -0,0 +1,61 @@
+import os
+
+WORKDIR = os.getcwd()
+
+
+# Cellranger call to process the raw GEX samples
+# input: fastq directory from config file and reference transcriptome read from config file
+# NOTE: the fastq directory as set in the config is interpreted to be relative to the gExcite working directory
+# output: cellranger files
+rule cellranger_count_gex:
+    input:
+        fastqs_dir=WORKDIR
+        + "/"
+        + config["inputOutput"]["input_fastqs_gex"]
+        + "{sample}/",
+        reference=config["resources"]["reference_transcriptome"],
+    output:
+        features_file="results/cellranger_gex/{sample}.features.tsv",
+        matrix_file="results/cellranger_gex/{sample}.matrix.mtx",
+        barcodes_file="results/cellranger_gex/{sample}.barcodes.tsv",
+        web_file=report(
+            "results/cellranger_gex/{sample}.web_summary.html",
+            caption="../report/cellranger.rst",
+            category="preprocessing QC",
+        ),
+        metrics_file="results/cellranger_gex/{sample}.metrics_summary.csv",
+    params:
+        cr_out="results/cellranger_gex/",
+        variousParams=config["tools"]["cellranger_count_gex"]["variousParams"],
+        targetCells=getTargetCellsCellranger_simple,
+        mySample="{sample}",
+    threads: config["computingResources"]["threads"]["high"]
+    log:
+        "logs/cellranger_count_gex/{sample}.log",
+    resources:
+        mem_mb=config["computingResources"]["mem_mb"]["high"],
+        runtime=config["computingResources"]["runtime"]["high"],
+    benchmark:
+        "results/cellranger_gex/benchmark/{sample}.cellranger_count_gex.benchmark"
+    # NOTE: cellranger count function cannot specify the output directory, the output is the path you call it from.
+    # Therefore, a subshell is used here.
+    # Also, unzip and symlink output files in preparation for downstream steps
+    shell:
+        "(cd {params.cr_out}; "
+        "{config[tools][cellranger_count_gex][call]} count "
+        "--id={params.mySample} "
+        "--sample={params.mySample} "
+        "--transcriptome={input.reference} "
+        "--fastqs={input.fastqs_dir} "
+        "--nosecondary "
+        "--localcores={threads} "
+        "{params.variousParams}) "
+        "&> {log} ; "
+        "gunzip {params.cr_out}{params.mySample}/outs/filtered_feature_bc_matrix/features.tsv.gz ; "
+        "gunzip {params.cr_out}{params.mySample}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz ; "
+        "gunzip {params.cr_out}{params.mySample}/outs/filtered_feature_bc_matrix/matrix.mtx.gz ; "
+        "ln -frs '{params.cr_out}{params.mySample}/outs/filtered_feature_bc_matrix/features.tsv' '{output.features_file}' ; "
+        "ln -frs '{params.cr_out}{params.mySample}/outs/filtered_feature_bc_matrix/matrix.mtx' '{output.matrix_file}' ; "
+        "ln -frs '{params.cr_out}{params.mySample}/outs/filtered_feature_bc_matrix/barcodes.tsv' '{output.barcodes_file}' ; "
+        "ln -frs '{params.cr_out}{params.mySample}/outs/web_summary.html' '{output.web_file}' ; "
+        "ln -frs '{params.cr_out}{params.mySample}/outs/metrics_summary.csv' '{output.metrics_file}' "