From ba83def65eb5d46a3bc42267a7e1d49ffcfdfccc Mon Sep 17 00:00:00 2001 From: maxozo Date: Thu, 25 Jan 2024 16:32:15 +0000 Subject: [PATCH 01/16] cleaning up and speeding up pipeline --- bin/filter_bam_file_for_popscle_dsc_pileup.sh | 151 +++++++++++++ bin/sort_vcf_same_as_bam.sh | 206 ++++++++++++++++++ main.nf | 108 +-------- modules/{ => multi}/donor_match.nf | 0 modules/{ => multi}/gene_demulti/bcftools.nf | 0 modules/{ => multi}/gene_demulti/cellsnp.nf | 0 modules/{ => multi}/gene_demulti/demuxlet.nf | 2 + modules/{ => multi}/gene_demulti/freebayes.nf | 0 .../{ => multi}/gene_demulti/freemuxlet.nf | 0 modules/{ => multi}/gene_demulti/samtools.nf | 0 modules/{ => multi}/gene_demulti/scsplit.nf | 0 .../{ => multi}/gene_demulti/souporcell.nf | 0 modules/{ => multi}/gene_demulti/vireo.nf | 0 modules/{ => multi}/gene_demultiplexing.nf | 114 ++++++---- modules/{ => multi}/hash_demulti/bff.nf | 0 modules/{ => multi}/hash_demulti/demuxem.nf | 0 modules/{ => multi}/hash_demulti/gmm_demux.nf | 0 .../{ => multi}/hash_demulti/hashedDrops.nf | 0 modules/{ => multi}/hash_demulti/hashsolo.nf | 0 modules/{ => multi}/hash_demulti/htodemux.nf | 0 modules/{ => multi}/hash_demulti/multiseq.nf | 0 .../{ => multi}/hash_demulti/preprocess.nf | 0 modules/{ => multi}/hash_demulti/solo.nf | 0 modules/{ => multi}/hash_demultiplexing.nf | 0 modules/multi_demultiplexing.nf | 28 ++- modules/single/gene_demulti/demuxlet.nf | 12 +- modules/single/gene_demulti/freebayes.nf | 1 + modules/single/gene_demulti/freemuxlet.nf | 15 +- modules/single/gene_demulti/souporcell.nf | 2 +- modules/single/gene_demulti/vireo.nf | 6 +- modules/single/hash_demultiplexing.nf | 1 + modules/single_demultiplexing.nf | 118 ++++++++++ nextflow.config | 23 +- 33 files changed, 630 insertions(+), 157 deletions(-) create mode 100755 bin/filter_bam_file_for_popscle_dsc_pileup.sh create mode 100755 bin/sort_vcf_same_as_bam.sh rename modules/{ => multi}/donor_match.nf (100%) rename modules/{ => multi}/gene_demulti/bcftools.nf (100%) rename modules/{ => multi}/gene_demulti/cellsnp.nf (100%) rename modules/{ => multi}/gene_demulti/demuxlet.nf (99%) rename modules/{ => multi}/gene_demulti/freebayes.nf (100%) rename modules/{ => multi}/gene_demulti/freemuxlet.nf (100%) rename modules/{ => multi}/gene_demulti/samtools.nf (100%) rename modules/{ => multi}/gene_demulti/scsplit.nf (100%) rename modules/{ => multi}/gene_demulti/souporcell.nf (100%) rename modules/{ => multi}/gene_demulti/vireo.nf (100%) rename modules/{ => multi}/gene_demultiplexing.nf (91%) rename modules/{ => multi}/hash_demulti/bff.nf (100%) rename modules/{ => multi}/hash_demulti/demuxem.nf (100%) rename modules/{ => multi}/hash_demulti/gmm_demux.nf (100%) rename modules/{ => multi}/hash_demulti/hashedDrops.nf (100%) rename modules/{ => multi}/hash_demulti/hashsolo.nf (100%) rename modules/{ => multi}/hash_demulti/htodemux.nf (100%) rename modules/{ => multi}/hash_demulti/multiseq.nf (100%) rename modules/{ => multi}/hash_demulti/preprocess.nf (100%) rename modules/{ => multi}/hash_demulti/solo.nf (100%) rename modules/{ => multi}/hash_demultiplexing.nf (100%) create mode 100644 modules/single_demultiplexing.nf diff --git a/bin/filter_bam_file_for_popscle_dsc_pileup.sh b/bin/filter_bam_file_for_popscle_dsc_pileup.sh new file mode 100755 index 0000000..9cd8f8a --- /dev/null +++ b/bin/filter_bam_file_for_popscle_dsc_pileup.sh @@ -0,0 +1,151 @@ +#!/bin/bash +# +# Copyright (C): 2020-2021 - Gert Hulselmans +# +# Purpose: Filter BAM file for usage with popscle dsc-pileup by keeping reads: 
+#            - which overlap with SNPs in the VCF file
+#            - and which have a cell barcode (default: "CB" tag) contained in the cell barcode list
+#          Keeping only relevant reads for popscle dsc-pileup can speed it up quite significantly
+#          (depending on the reduction of the number of reads in the filtered BAM file vs original).
+
+
+
+# Function to check if any of the programs in a pipe failed.
+check_exit_codes () {
+    local GET_PIPESTATUS="${PIPESTATUS[@]}";
+    local exit_code;
+
+    for exit_code in ${GET_PIPESTATUS} ; do
+        if [ ${exit_code} -ne 0 ] ; then
+            return ${exit_code};
+        fi
+    done
+
+    return 0;
+}
+
+
+
+# Check if necessary programs are installed.
+check_if_programs_exists () {
+    local exit_code=0;
+
+    # Check if bedtools is installed.
+    if ! type bedtools > /dev/null 2>&1 ; then
+        printf 'Error: "bedtools" could not be found in PATH.\n' > /dev/stderr;
+        exit_code=2;
+    fi
+
+    # Check if samtools is installed.
+    if ! type samtools > /dev/null 2>&1 ; then
+        printf 'Error: "samtools" could not be found in PATH.\n' > /dev/stderr;
+        exit_code=2;
+    fi
+
+    if [ ${exit_code} -eq 2 ] ; then
+        return ${exit_code};
+    fi
+
+    # Check if samtools 1.10 or higher is installed (needs to have "-D STR:FILE" or "-D, --tag-file STR:FILE" option).
+    if ! samtools view --help 2>&1 | grep -q -- '-D.*STR:FILE' ; then
+        printf 'Error: The version of "samtools" (%s) should be 1.10 or higher (%s found).\n' \
+            "$(type samtools)" \
+            "$(samtools --version | head -n 1)" \
+            > /dev/stderr;
+        exit_code=2;
+    fi
+
+    return ${exit_code};
+}
+
+
+
+filter_bam_file_for_popscle_dsc_pileup () {
+    local input_bam_filename="${1}";
+    local barcodes_tsv_filename="${2}";
+    local vcf_filename="${3}";
+    local output_bam_filename="${4}";
+    local barcode_tag="${5:-CB}";
+
+    local exit_code=0;
+
+    if [ ${#@} -lt 4 ] ; then
+        printf 'Usage: filter_bam_file_for_popscle_dsc_pileup input_bam_filename barcodes_tsv_filename vcf_filename output_bam_filename [barcode_tag]\n\n';
+        printf 'Purpose: Filter BAM file for usage with popscle dsc-pileup by keeping reads:\n';
+        printf '           - which overlap with SNPs in the VCF file\n';
+        printf '           - and which have a cell barcode (default: "CB" tag) contained in the cell barcode list\n';
+        printf '         Keeping only relevant reads for popscle dsc-pileup can speed it up quite significantly\n';
+        printf '         (depending on the reduction of the number of reads in the filtered BAM file vs original).\n\n';
+
+        return 1;
+    fi
+
+    if [ ! -f "${input_bam_filename}" ] ; then
+        printf 'Error: Input (CellRanger) BAM file "%s" could not be found.\n' "${input_bam_filename}" > /dev/stderr;
+        return 2;
+    fi
+
+    if [ ! -f "${barcodes_tsv_filename}" ] ; then
+        printf 'Error: File with barcodes "%s" could not be found.\n' "${barcodes_tsv_filename}" > /dev/stderr;
+        return 2;
+    fi
+
+    if [ ! -f "${vcf_filename}" ] ; then
+        printf 'Error: File with unique SNPs per sample "%s" could not be found.\n' "${vcf_filename}" > /dev/stderr;
+        return 2;
+    fi
+
+    if [ ${#barcode_tag} -ne 2 ] ; then
+        printf 'Error: Barcode tag "%s" should be 2 characters.\n' "${barcode_tag}" > /dev/stderr;
+        return 2;
+    fi
+
+    # Check if bedtools and samtools are in PATH.
+    if ! check_if_programs_exists ; then
+        return 2;
+    fi
+
+    # Create much smaller BAM file for dsc-pileup of popscle:
+    #   - Convert VCF file with unique SNPs for each sample
+    #     to a BED file and merge adjacent SNP regions to one.
+    #   - Only include reads that contain a SNP position
+    #     and which contain a cell barcode of interest.
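+    #
+    # For illustration only (hypothetical file names), both branches below boil down to:
+    #
+    #   bedtools merge -i snps.vcf \
+    #     | samtools view --write-index -L - -D CB:barcodes.tsv -o filtered.bam input.bam
+    #
+    # "-L -" makes samtools take the merged SNP regions as a BED stream from stdin,
+    # and "-D CB:barcodes.tsv" keeps only reads whose CB tag value is listed in the
+    # file (the samtools >= 1.10 "--tag-file" option checked for above).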
+ if [ "${barcodes_tsv_filename%.gz}".gz = "${barcodes_tsv_filename}" ] ; then + # Barcodes file is compressed with gzip. + bedtools merge -i "${vcf_filename}" \ + | samtools view\ + -@ 8 \ + --write-index \ + -L - \ + -D "${barcode_tag}":<(zcat "${barcodes_tsv_filename}") \ + -o "${output_bam_filename}" \ + "${input_bam_filename}"; + + # Check if any of the previous commands failed. + check_exit_codes; + + exit_code=$?; + else + # Barcodes file is uncompressed. + bedtools merge -i "${vcf_filename}" \ + | samtools view\ + -@ 8 \ + --write-index \ + -L - \ + -D "${barcode_tag}":"${barcodes_tsv_filename}" \ + -o "${output_bam_filename}" \ + "${input_bam_filename}"; + + # Check if any of the previous commands failed. + check_exit_codes; + + exit_code=$?; + fi + + + return ${exit_code}; +} + + + +filter_bam_file_for_popscle_dsc_pileup "${@}"; diff --git a/bin/sort_vcf_same_as_bam.sh b/bin/sort_vcf_same_as_bam.sh new file mode 100755 index 0000000..9638b4f --- /dev/null +++ b/bin/sort_vcf_same_as_bam.sh @@ -0,0 +1,206 @@ +#!/bin/bash +# +# Copyright (C): 2020-2021 - Gert Hulselmans +# +# Purpose: Sort VCF file in the same order as the BAM file, so it can be used with popscle. + + + +# Function to check if any of the programs in a pipe failed. +check_exit_codes () { + local GET_PIPESTATUS="${PIPESTATUS[@]}"; + local exit_code; + + for exit_code in ${GET_PIPESTATUS} ; do + if [ ${exit_code} -ne 0 ] ; then + return ${exit_code}; + fi + done + + return 0; +} + + + +# Check if necessary programs are installed. +check_if_programs_exists () { + local exit_code=0; + + # Check if awk is installed. + if ! type awk > /dev/null 2>&1 ; then + printf 'Error: "awk" could not be found in PATH.\n' > /dev/stderr; + exit_code=2; + fi + + # Check if bcftools is installed. + if ! type bcftools > /dev/null 2>&1 ; then + printf 'Error: "bcftools" could not be found in PATH.\n' > /dev/stderr; + exit_code=2; + fi + + # Check if samtools is installed. + if ! type samtools > /dev/null 2>&1 ; then + printf 'Error: "samtools" could not be found in PATH.\n' > /dev/stderr; + exit_code=2; + fi + + return ${exit_code}; +} + + + +# Get order of the contigs (chromosomes) and their length from the BAM header. +get_contig_order_from_bam () { + local bam_input_file="${1}"; + local output_type="${2}"; + + if [ ${#@} -ne 2 ] ; then + printf 'Usage: get_contig_order_from_bam BAM_file output_type\n\n'; + printf 'Arguments:\n'; + printf ' - BAM_file: BAM file from which to get the contig order and contig lengths.\n'; + printf ' - output_type:\n'; + printf ' - "names": Return contig names.\n'; + printf ' - "chrom_sizes": Return contig names and contig lengths.\n'; + printf ' - "vcf": Return VCF header section for contigs.\n\n'; + return 1; + fi + + case "${output_type}" in + 'names') + ;; + 'chrom_sizes') + ;; + 'vcf') + ;; + *) + printf 'Error: output_type "%s" is not supported.\n' "${output_type}" > /dev/stderr; + return 1; + ;; + esac + + check_if_programs_exists || return $?; + + # Get the order of the contigs from the BAM header. + samtools view -H "${bam_input_file}" \ + | awk \ + -F '\t' \ + -v output_type="${output_type}" \ + ' + { + # Only look at sequence header fields. + if ($1 == "@SQ") { + contig_idx += 1; + contig_name = ""; + contig_length = ""; + + # Extract contig (chromosome) name and contig (chromosome) length. 
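+                # A typical "@SQ" header line looks like (fields are TAB separated):
+                #   @SQ    SN:chr1    LN:248956422
+                # so the contig name is taken from the "SN:" field and its length from "LN:".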
+                for (i = 2; i <= NF; i++) {
+                    if ($i ~ /^SN:/) {
+                        contig_name = substr($i, 4);
+                    }
+
+                    if ($i ~ /^LN:/) {
+                        contig_length = substr($i, 4);
+                    }
+
+                    # Create contig order to name, contig order to length and VCF contig mappings.
+                    contig_idx_to_name[contig_idx] = contig_name;
+                    contig_idx_to_length[contig_idx] = contig_length;
+                    contig_idx_to_vcf_contig[contig_idx] = sprintf("##contig=<ID=%s,length=%s>", contig_name, contig_length);
+                }
+            }
+        } END {
+            if (contig_idx == 0) {
+                printf "Error: No \"@SQ\" header line found in BAM file.\n" > "/dev/stderr";
+                exit(1);
+            } else if (output_type == "names") {
+                contig_names = "";
+
+                for (contig_idx = 1; contig_idx <= length(contig_idx_to_name); contig_idx++) {
+                    contig_names = contig_names " " contig_idx_to_name[contig_idx];
+                }
+
+                # Print all contig names (without leading space).
+                print substr(contig_names, 2);
+            } else if (output_type == "chrom_sizes") {
+                # Print all contig names with their length in a TAB separated fashion.
+                for (contig_idx = 1; contig_idx <= length(contig_idx_to_name); contig_idx++) {
+                    print contig_idx_to_name[contig_idx] "\t" contig_idx_to_length[contig_idx];
+                }
+            } else if (output_type == "vcf") {
+                # Print VCF header section for contigs.
+                for (contig_idx = 1; contig_idx <= length(contig_idx_to_vcf_contig); contig_idx++) {
+                    print contig_idx_to_vcf_contig[contig_idx];
+                }
+            }
+        }'
+
+    check_exit_codes;
+
+    return $?;
+}
+
+
+
+# Sort VCF file in the same order as the BAM file, so it can be used with popscle.
+sort_vcf_same_as_bam () {
+    local bam_input_file="${1}";
+    local vcf_input_file="${2}";
+    local vcf_type="${3:-v}";
+
+    if [ ${#@} -lt 2 ] ; then
+        printf 'Usage: sort_vcf_same_as_bam BAM_file VCF_file [VCF_type]\n\n';
+        printf 'Arguments:\n';
+        printf '    - BAM_file: BAM file from which to get the contig order to sort the VCF file.\n';
+        printf '    - VCF_file: VCF file to sort by contig order as defined in the BAM file.\n';
+        printf '    - VCF_type: VCF output file type (default: same as input VCF file type):\n';
+        printf '                v: uncompressed VCF, z: compressed VCF,\n';
+        printf '                u: uncompressed BCF, b: compressed BCF\n\n';
+        printf 'Purpose:\n';
+        printf '    Sort VCF file in the same order as the BAM file, so it can be used with popscle.\n\n';
+        return 1;
+    fi
+
+    check_if_programs_exists || return $?;
+
+    # If VCF type is not specified, try to guess it from the filename extension.
+    if [ ${#@} -eq 2 ] ; then
+        if [ "${vcf_input_file%.vcf.gz}" != "${vcf_input_file}" ] ; then
+            vcf_type='z';
+        elif [ "${vcf_input_file%.bcf}" != "${vcf_input_file}" ] ; then
+            vcf_type='b';
+        fi
+    fi
+
+    # Sort VCF file by same chromosome order as BAM file.
+    cat <(
+          # Create new VCF header:
+          #   - Get VCF header of VCF input file.
+          #   - Remove all contig header lines and "#CHROM" line from the VCF header.
+          #   - Append contig headers in the order they appear in the input BAM file.
+          #   - Add "#CHROM" line as last line of the new VCF header.
+          bcftools view -h "${vcf_input_file}" \
+            | awk \
+                '
+                {
+                    if ($1 !~ /^##contig=/ && $1 !~ /^#CHROM/) {
+                        # Remove all contig header lines and "#CHROM" line.
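+                        # (every other header line is printed unchanged; the contig
+                        # lines and the "#CHROM" line are appended again below, in
+                        # the order defined by the BAM header)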
+ print $0; + } + }' \ + | cat \ + - \ + <(get_contig_order_from_bam "${bam_input_file}" 'vcf') \ + <(bcftools view -h "${vcf_input_file}" | tail -n 1) \ + ) \ + <(bcftools view -H -O v "${vcf_input_file}") \ + | bcftools sort -O "${vcf_type}"; + + check_exit_codes; + + return $?; +} + + + +sort_vcf_same_as_bam "${@}"; diff --git a/main.nf b/main.nf index c455a47..4bdd4f6 100644 --- a/main.nf +++ b/main.nf @@ -1,111 +1,17 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=2 -include { run_multi } from './modules/multi_demultiplexing' -include { gene_demultiplexing } from './modules/single/gene_demultiplexing' -include { hash_demultiplexing } from './modules/single/hash_demultiplexing' -include { donor_match } from './modules/single/donor_match' - -process summary_all{ - publishDir "$projectDir/$params.outdir/$params.mode", mode: 'copy' - label 'small_mem' - - conda "pandas scanpy mudata" - - input: - path gene_demulti_result - path hash_demulti_result - output: - path "summary" - - script: - """ - summary.py --gene_demulti $gene_demulti_result --hash_demulti $hash_demulti_result - """ -} - -process generate_data{ - publishDir "$projectDir/$params.outdir/$params.mode/data_output", mode: 'copy' - - conda "pandas scanpy mudata" - - input: - path assignment - val generate_anndata - val generate_mudata - val rna_matrix - val hto_matrix - output: - path "adata_with_donor_matching.h5ad", optional: true - path "mudata_with_donor_matching.h5mu", optional: true - - script: - def generate_adata = "" - def generate_mdata = "" - - if (generate_anndata == "True"){ - if(rna_matrix == "None"){ - error "Error: RNA count matrix is not given." - } - generate_adata = "--generate_anndata --read_rna_mtx $rna_matrix" - } - if (generate_mudata == "True"){ - if(rna_matrix == "None"){ - error "Error: RNA count matrix is not given." - } - if(hto_matrix == "None"){ - error "Error: HTO count matrix is not given." 
-        }
-        generate_mdata = "--generate_mudata --read_rna_mtx $rna_matrix --read_hto_mtx $hto_matrix"
-    }
-
-    """
-    generate_data.py --assignment $assignment $generate_adata $generate_mdata
-    """
-}
-
-workflow run_single{
-    if (params.mode == "genetic"){
-        gene_demultiplexing()
-        if (params.match_donor == "True"){
-            donor_match(gene_demultiplexing.out)
-        }
-    }
-    else if (params.mode == "hashing"){
-        print("Running single sample")
-        hash_demultiplexing(params.rna_matrix_raw, params.rna_matrix_filtered, params.hto_matrix_raw, params.hto_matrix_filtered)
-        if (params.match_donor == "True"){
-            donor_match(hash_demultiplexing.out)
-        }
-    }
-    else if (params.mode == "rescue"){
-        hash_demultiplexing(params.rna_matrix_raw, params.rna_matrix_filtered, params.hto_matrix_raw, params.hto_matrix_filtered)
-        gene_demultiplexing()
-        gene_summary = gene_demultiplexing.out
-        hash_summary = hash_demultiplexing.out
-        summary_all(gene_summary, hash_summary)
-        if (params.match_donor == "True"){
-            donor_match(summary_all.out)
-            if (params.generate_anndata == "True" || params.generate_mudata == "True" ){
-                generate_data(donor_match.out, params.generate_anndata, params.generate_mudata,
-                    params.rna_matrix_filtered, params.hto_matrix_filtered)
-            }
-        }
-    }
-    else if (params.mode == "donor_match"){
-        donor_match(params.demultiplexing_result)
-        if (params.generate_anndata == "True" || params.generate_mudata == "True" ){
-            generate_data(donor_match.out, params.generate_anndata, params.generate_mudata,
-                params.rna_matrix_filtered, params.hto_matrix_filtered)
-        }
-    }
-}
+nextflow.enable.dsl=2
+include { run_multi } from "$projectDir/modules/multi_demultiplexing"
+include { run_single } from "$projectDir/modules/single_demultiplexing"
 
 workflow {
+    // Here we decide if it is a single sample demultiplexing or multi input demultiplexing run.
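+    // For example, `nextflow run main.nf --multi_input samples.csv` (a hypothetical
+    // sample sheet) takes the multi-sample branch; leaving --multi_input unset
+    // demultiplexes a single sample.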
if (params.multi_input == null){ + // Single Mode run_single() } else{ + // Multi mode run_multi() } -} \ No newline at end of file +} diff --git a/modules/donor_match.nf b/modules/multi/donor_match.nf similarity index 100% rename from modules/donor_match.nf rename to modules/multi/donor_match.nf diff --git a/modules/gene_demulti/bcftools.nf b/modules/multi/gene_demulti/bcftools.nf similarity index 100% rename from modules/gene_demulti/bcftools.nf rename to modules/multi/gene_demulti/bcftools.nf diff --git a/modules/gene_demulti/cellsnp.nf b/modules/multi/gene_demulti/cellsnp.nf similarity index 100% rename from modules/gene_demulti/cellsnp.nf rename to modules/multi/gene_demulti/cellsnp.nf diff --git a/modules/gene_demulti/demuxlet.nf b/modules/multi/gene_demulti/demuxlet.nf similarity index 99% rename from modules/gene_demulti/demuxlet.nf rename to modules/multi/gene_demulti/demuxlet.nf index 2d10b8e..7faecca 100755 --- a/modules/gene_demulti/demuxlet.nf +++ b/modules/multi/gene_demulti/demuxlet.nf @@ -107,6 +107,8 @@ workflow demultiplex_demuxlet{ take: input_list main: + + tag_group = params.tag_group tag_UMI = params.tag_UMI sm = params.sm diff --git a/modules/gene_demulti/freebayes.nf b/modules/multi/gene_demulti/freebayes.nf similarity index 100% rename from modules/gene_demulti/freebayes.nf rename to modules/multi/gene_demulti/freebayes.nf diff --git a/modules/gene_demulti/freemuxlet.nf b/modules/multi/gene_demulti/freemuxlet.nf similarity index 100% rename from modules/gene_demulti/freemuxlet.nf rename to modules/multi/gene_demulti/freemuxlet.nf diff --git a/modules/gene_demulti/samtools.nf b/modules/multi/gene_demulti/samtools.nf similarity index 100% rename from modules/gene_demulti/samtools.nf rename to modules/multi/gene_demulti/samtools.nf diff --git a/modules/gene_demulti/scsplit.nf b/modules/multi/gene_demulti/scsplit.nf similarity index 100% rename from modules/gene_demulti/scsplit.nf rename to modules/multi/gene_demulti/scsplit.nf diff --git a/modules/gene_demulti/souporcell.nf b/modules/multi/gene_demulti/souporcell.nf similarity index 100% rename from modules/gene_demulti/souporcell.nf rename to modules/multi/gene_demulti/souporcell.nf diff --git a/modules/gene_demulti/vireo.nf b/modules/multi/gene_demulti/vireo.nf similarity index 100% rename from modules/gene_demulti/vireo.nf rename to modules/multi/gene_demulti/vireo.nf diff --git a/modules/gene_demultiplexing.nf b/modules/multi/gene_demultiplexing.nf similarity index 91% rename from modules/gene_demultiplexing.nf rename to modules/multi/gene_demultiplexing.nf index 938eba7..80c0ab9 100644 --- a/modules/gene_demultiplexing.nf +++ b/modules/multi/gene_demultiplexing.nf @@ -1,5 +1,6 @@ #!/usr/bin/env nextflow nextflow.enable.dsl=2 + include { data_preprocess } from './gene_demulti/samtools' include { filter_variant } from './gene_demulti/bcftools' include { variant_cellSNP } from './gene_demulti/cellsnp' @@ -10,6 +11,15 @@ include { demultiplex_scSplit } from './gene_demulti/scsplit' include { demultiplex_souporcell } from './gene_demulti/souporcell' include { demultiplex_vireo } from './gene_demulti/vireo' +def split_input(input){ + if (input =~ /;/ ){ + Channel.from(input).map{ return it.tokenize(';')}.flatten() + } + else{ + Channel.from(input) + } +} + process summary{ publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti", mode: 'copy' label 'small_mem' @@ -78,30 +88,29 @@ process summary{ """ } -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return 
it.tokenize(';')}.flatten() - } - else{ - Channel.from(input) - } -} workflow gene_demultiplexing { - if ((params.demuxlet == "True" & params.demuxlet_preprocess == "True")| \ - (params.freemuxlet == "True" & params.freemuxlet_preprocess == "True")| \ - (params.scSplit == "True" & params.scSplit_preprocess == "True") | \ - (params.vireo == "True" & params.vireo_preprocess == "True") | \ - (params.souporcell == "True" & params.souporcell_preprocess == "True")){ - Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.bam)} - | data_preprocess - qc_bam = data_preprocess.out.map{ it -> tuple( it.name.tokenize( '_' ).last(), it + "/sorted.bam", it + "/sorted.bam.bai") } - } + if ((params.demuxlet == "True" & params.demuxlet_preprocess == "True") | \ + (params.freemuxlet == "True" & params.freemuxlet_preprocess == "True") | \ + (params.scSplit == "True" & params.scSplit_preprocess == "True") | \ + (params.vireo == "True" & params.vireo_preprocess == "True") | \ + (params.souporcell == "True" & params.souporcell_preprocess == "True")) { + + Channel.fromPath(params.multi_input) \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, row.bam)} + | data_preprocess + qc_bam = data_preprocess.out.map{ it -> tuple( it.name.tokenize( '_' ).last(), it + "/sorted.bam", it + "/sorted.bam.bai") } + } + + + ////////// + //FreeBayes/ scSplit + ////////// if (params.scSplit == "True" & params.scSplit_variant == 'True'){ + freebayes_region = Channel.from(1..22, "X","Y").flatten() if (params.region != "None"){ freebayes_region = split_input(params.region) @@ -117,27 +126,10 @@ workflow gene_demultiplexing { } filter_variant(variant_freebayes.out) freebayes_vcf = filter_variant.out.map{ it -> tuple(it[0], it[1] + "/filtered_sorted_total_chroms.vcf")} - - } - - if (params.vireo == "True" & params.vireo_variant == 'True'){ - if(params.vireo_preprocess == 'True'){ - input_param_cellsnp = Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.barcodes) } - qc_bam_new = qc_bam.join(input_param_cellsnp) - variant_cellSNP(qc_bam_new) - } - else{ - Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.bam, row.bam_index, row.barcodes)} - | variant_cellSNP - } - cellsnp_vcf = variant_cellSNP.out.map{ it -> tuple( it.name.tokenize( '_' ).last(), it + "/*/cellSNP.cells.vcf") } } if (params.scSplit == "True"){ + if (params.scSplit_preprocess == 'False'){ input_bam_scsplit = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ @@ -146,14 +138,17 @@ workflow gene_demultiplexing { else{ input_bam_scsplit = qc_bam } + if (params.scSplit_variant == 'True'){ input_vcf_scsplit = freebayes_vcf } else{ + input_vcf_scsplit = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ | map { row-> tuple(row.sampleId, row.vcf_mixed)} } + // here if there are genotypes provided we need to ensure bam is sorted corectly and is subsampled on the regions needed. 
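+            // (the helper scripts added in bin/ cover this: filter_bam_file_for_popscle_dsc_pileup.sh
+            // subsets the BAM to barcode-bearing reads overlapping the SNPs, and
+            // sort_vcf_same_as_bam.sh reorders the VCF contigs to match the BAM header)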
input_param_scsplit = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ | map { row-> tuple(row.sampleId, row.barcodes, row.nsample, row.vcf_donor)} @@ -167,7 +162,31 @@ workflow gene_demultiplexing { scSplit_out = channel.value("no_result") } + + ////////// + //CellSNP/Vireo + ////////// + if (params.vireo == "True" & params.vireo_variant == 'True'){ + + if(params.vireo_preprocess == 'True'){ + input_param_cellsnp = Channel.fromPath(params.multi_input) \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, row.barcodes) } + qc_bam_new = qc_bam.join(input_param_cellsnp) + variant_cellSNP(qc_bam_new) + } + else{ + Channel.fromPath(params.multi_input) \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, row.bam, row.bam_index, row.barcodes)} + | variant_cellSNP + } + cellsnp_vcf = variant_cellSNP.out.map{ it -> tuple( it.name.tokenize( '_' ).last(), it + "/*/cellSNP.cells.vcf") } + + } + if (params.vireo == "True"){ + if (params.vireo_variant == 'True'){ input_vcf_vireo = cellsnp_vcf } @@ -188,6 +207,11 @@ workflow gene_demultiplexing { vireo_out = channel.value("no_result") } + + ////////// + //Demuxlet + ////////// + if (params.demuxlet == "True"){ if (params.demuxlet_preprocess == 'False'){ input_bam_demuxlet = Channel.fromPath(params.multi_input) \ @@ -208,6 +232,11 @@ workflow gene_demultiplexing { demuxlet_out = channel.value("no_result") } + + ////////// + //Freemuxlet + ////////// + if (params.freemuxlet == "True"){ if (params.freemuxlet_preprocess == 'False'){ input_bam_freemuxlet = Channel.fromPath(params.multi_input) \ @@ -228,6 +257,11 @@ workflow gene_demultiplexing { freemuxlet_out = channel.value("no_result") } + + ////////// + //Souporcell + ////////// + if (params.souporcell == "True"){ if (params.souporcell_preprocess == 'False'){ input_bam_souporcell = Channel.fromPath(params.multi_input) \ @@ -240,6 +274,7 @@ workflow gene_demultiplexing { input_param_souporcell = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ | map { row-> tuple(row.sampleId, row.barcodes, row.nsample, row.vcf_donor)} + input_list_souporcell = input_bam_souporcell.join(input_param_souporcell) demultiplex_souporcell(input_list_souporcell) souporcell_out = demultiplex_souporcell.out @@ -248,10 +283,15 @@ workflow gene_demultiplexing { souporcell_out = channel.value("no_result") } + ////////// + //Summary + ////////// + Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ | map { row-> tuple(row.sampleId, file(row.hto_matrix_filtered), file(row.rna_matrix_filtered))} | set {input_list_summary} + summary(input_list_summary, demuxlet_out, freemuxlet_out, vireo_out, souporcell_out, scSplit_out, params.generate_anndata, params.generate_mudata) diff --git a/modules/hash_demulti/bff.nf b/modules/multi/hash_demulti/bff.nf similarity index 100% rename from modules/hash_demulti/bff.nf rename to modules/multi/hash_demulti/bff.nf diff --git a/modules/hash_demulti/demuxem.nf b/modules/multi/hash_demulti/demuxem.nf similarity index 100% rename from modules/hash_demulti/demuxem.nf rename to modules/multi/hash_demulti/demuxem.nf diff --git a/modules/hash_demulti/gmm_demux.nf b/modules/multi/hash_demulti/gmm_demux.nf similarity index 100% rename from modules/hash_demulti/gmm_demux.nf rename to modules/multi/hash_demulti/gmm_demux.nf diff --git a/modules/hash_demulti/hashedDrops.nf b/modules/multi/hash_demulti/hashedDrops.nf similarity index 100% rename from modules/hash_demulti/hashedDrops.nf rename to modules/multi/hash_demulti/hashedDrops.nf diff --git 
a/modules/hash_demulti/hashsolo.nf b/modules/multi/hash_demulti/hashsolo.nf similarity index 100% rename from modules/hash_demulti/hashsolo.nf rename to modules/multi/hash_demulti/hashsolo.nf diff --git a/modules/hash_demulti/htodemux.nf b/modules/multi/hash_demulti/htodemux.nf similarity index 100% rename from modules/hash_demulti/htodemux.nf rename to modules/multi/hash_demulti/htodemux.nf diff --git a/modules/hash_demulti/multiseq.nf b/modules/multi/hash_demulti/multiseq.nf similarity index 100% rename from modules/hash_demulti/multiseq.nf rename to modules/multi/hash_demulti/multiseq.nf diff --git a/modules/hash_demulti/preprocess.nf b/modules/multi/hash_demulti/preprocess.nf similarity index 100% rename from modules/hash_demulti/preprocess.nf rename to modules/multi/hash_demulti/preprocess.nf diff --git a/modules/hash_demulti/solo.nf b/modules/multi/hash_demulti/solo.nf similarity index 100% rename from modules/hash_demulti/solo.nf rename to modules/multi/hash_demulti/solo.nf diff --git a/modules/hash_demultiplexing.nf b/modules/multi/hash_demultiplexing.nf similarity index 100% rename from modules/hash_demultiplexing.nf rename to modules/multi/hash_demultiplexing.nf diff --git a/modules/multi_demultiplexing.nf b/modules/multi_demultiplexing.nf index cfbeafb..8f9f954 100644 --- a/modules/multi_demultiplexing.nf +++ b/modules/multi_demultiplexing.nf @@ -1,8 +1,10 @@ #!/usr/bin/env nextflow nextflow.enable.dsl=2 -include { hash_demultiplexing } from './hash_demultiplexing' -include { gene_demultiplexing } from './gene_demultiplexing' -include { donor_match } from './donor_match' + +include { hash_demultiplexing } from "$projectDir/modules/multi/hash_demultiplexing" +include { gene_demultiplexing } from "$projectDir/modules/multi/gene_demultiplexing" +include { donor_match } from "$projectDir/modules/multi/donor_match" + process generate_data{ publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/data_output", mode: 'copy' @@ -62,9 +64,18 @@ process summary_all{ } workflow run_multi{ + + print("-----Running multiple samples-----") + if (params.mode == "genetic"){ + + // Performing genetic demultiplexing methodologies gene_demultiplexing() + //////////// + if (params.match_donor == "True"){ + + gene_demultiplexing.out.view() Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ @@ -74,7 +85,11 @@ workflow run_multi{ } } else if (params.mode == "hashing"){ + + // Performing hashing demultplexing hash_demultiplexing() + //////////// + if (params.match_donor == "True"){ Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ @@ -84,12 +99,17 @@ workflow run_multi{ } } else if (params.mode == "rescue"){ + + // Performing both hashing and genetic demultiplexing methods hash_demultiplexing() gene_demultiplexing() + //////////// + gene_summary = gene_demultiplexing.out hash_summary = hash_demultiplexing.out input_summary_all = gene_summary.join(hash_summary) summary_all(input_summary_all) + if (params.match_donor == "True"){ Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ @@ -107,6 +127,8 @@ workflow run_multi{ } } else if (params.mode == "donor_match"){ + + // Performing just donor matching Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ | map { row -> tuple(row.sampleId, row.nsample, row.barcodes, row.celldata, row.vireo_parent_dir, row.demultiplexing_result)} \ diff --git a/modules/single/gene_demulti/demuxlet.nf b/modules/single/gene_demulti/demuxlet.nf index 208ee23..4cfab8a 100755 --- a/modules/single/gene_demulti/demuxlet.nf 
+++ b/modules/single/gene_demulti/demuxlet.nf @@ -5,8 +5,8 @@ nextflow.enable.dsl=2 process demuxlet { publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti/demuxlet", mode: 'copy' label 'small_mem' - - conda "bioconda::popscle" + + conda "bioconda::popscle bioconda::samtools bioconda::bedtools " input: each sam @@ -51,8 +51,9 @@ process demuxlet { def samfile = "--sam $sam" def taggroup = tag_group != 'None' ? "--tag-group ${tag_group}" : '' def tagUMI = tag_UMI != 'None' ? "--tag-UMI ${tag_UMI}" : '' - def vcfref = plp == 'True' ? "--vcf ${vcf_donor}" : "" - def vcfref_name = plp == 'True' ? vcf_donor : "No VCF Ref is used because plp is not performed." + def vcfref = plp == 'True' ? "--vcf vcfref.vcf" : "" + def vcfref_sort ="sort_vcf_same_as_bam.sh ${sam} ${vcf_donor} > vcfref.vcf" + def vcfref_name = plp == 'True' ? 'vcfref.vcf' : "No VCF Ref is used because plp is not performed." def smlist = sm != 'None' ? "--sm $sm" : '' def sm_list_file = sm_list != 'None' ? "--sm-list ${sm_list}" : '' def sm_list_file_name = sm_list != 'None' ? file(sm_list).baseName : "No sm list file is given" @@ -70,7 +71,7 @@ process demuxlet { def minuniq = "--min-uniq ${min_uniq}" def minsnp = "--min-snp ${min_snp}" def plp_name = plp == 'True' ? "plp performed" : "plp not performed" - def vcfdonor = "--vcf ${vcf_donor}" + def vcfdonor = "--vcf vcfref.vcf" def fieldinfo = "--field $field" def genoerror_off = "--geno-error-offset ${geno_error_offset}" def genoerror_cof = "--geno-error-coeff ${geno_error_coeff}" @@ -81,6 +82,7 @@ process demuxlet { def doubletprior = "--doublet-prior ${doublet_prior}" """ + ${vcfref_sort} mkdir demuxlet_${task.index} touch demuxlet_${task.index}/params.csv barcode_num=\$(wc -l < "${group_list}") diff --git a/modules/single/gene_demulti/freebayes.nf b/modules/single/gene_demulti/freebayes.nf index 930448c..6a3fa26 100644 --- a/modules/single/gene_demulti/freebayes.nf +++ b/modules/single/gene_demulti/freebayes.nf @@ -170,6 +170,7 @@ process freebayes{ def dd = dd_freebayes != 'False' ? "-dd" : '' """ + freebayes ${bam_freebayes} $stdin -f ${ref_freebayes} $targets $region $samples $populations ${cnv_map} \ -v ${region_freebayes}_${vcf_freebayes} $gvcf ${gvcf_chunk} ${gvcf_dont_use_chunk} ${variant_input} ${only_use_input_alleles} ${haplotype_basis_alleles} ${report_all_haplotype_alleles} ${report_monomorphic} $pvar ${strict_vcf} \ $theta $ploidy ${pooled_discrete} ${pooled_continuous} \ diff --git a/modules/single/gene_demulti/freemuxlet.nf b/modules/single/gene_demulti/freemuxlet.nf index e006d2a..bc22768 100755 --- a/modules/single/gene_demulti/freemuxlet.nf +++ b/modules/single/gene_demulti/freemuxlet.nf @@ -5,10 +5,11 @@ process freemuxlet { publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti/freemuxlet", mode: 'copy' label 'small_mem' - conda "bioconda::popscle" + conda "bioconda::popscle bioconda::samtools bioconda::bedtools bioconda::bcftools=1.9" input: each sam + path(barcodes) each vcf each tag_group each tag_UMI @@ -42,10 +43,10 @@ process freemuxlet { path "freemuxlet_${task.index}" script: - def samfile = "--sam $sam" + def samfile = "--sam filtered_bam_file.bam" def taggroup = tag_group != 'None' ? "--tag-group ${tag_group}" : '' def tagUMI = tag_UMI != 'None' ? "--tag-UMI ${tag_UMI}" : '' - def vcffile = "--vcf $vcf" + def vcffile = "--vcf samples.sorted_as_in_bam.vcf" def smlist = sm != 'None' ? "--sm $sm" : '' def sm_list_file = sm_list != 'None' ? "--sm-list ${sm_list}" : '' def sm_list_file_name = sm_list != 'None' ? 
file(sm_list).baseName : "No sm list file is given" @@ -73,10 +74,14 @@ process freemuxlet { def keepinit_missing = keep_init_missing != "False" ? "--keep-init-missing" : '' """ + echo 'test5' + filter_bam_file_for_popscle_dsc_pileup.sh ${sam} ${barcodes} ${vcf} filtered_bam_file.bam + sort_vcf_same_as_bam.sh filtered_bam_file.bam ${vcf} > samples.sorted_as_in_bam.vcf + mkdir freemuxlet_${task.index} mkdir freemuxlet_${task.index}/plp touch freemuxlet_${task.index}/params.csv - echo -e "Argument,Value \n samfile,${sam} \n tag_group,${tag_group} \n tag_UMI,${tag_UMI} \n vcf_file,${vcf} \n sm,${sm} \n sm_list_file,${sm_list_file_name} \n sam_verbose,${sam_verbose} \n vcf_verbose,${vcf_verbose} \n skip_umi,${skip_umi} \n cap_BQ,${cap_BQ} \n min_BQ,${min_BQ} \n min_MQ,${min_MQ} \n min_TD,${min_TD} \n excl_flag,${excl_flag} \n grouplist,${group_list} \n min_total,${min_total} \n min_uniq,${min_uniq} \n min_umi,${min_umi} \n min_snp,${min_snp} \n init_cluster,${init_cluster} \n nsample,${nsample} \n aux_files,${aux_files} \n verbose,${verbose} \n doublet_prior,${doublet_prior} \n bf_thres,${bf_thres} \n frac_init_clust,${frac_init_clust} \n iter_init,${iter_init} \n keep_init_missing,${keep_init_missing}" >> freemuxlet_${task.index}/params.csv + echo -e "Argument,Value \n samfile,filtered_bam_file.bam \n tag_group,${tag_group} \n tag_UMI,${tag_UMI} \n vcf_file,samples.sorted_as_in_bam.vcf \n sm,${sm} \n sm_list_file,${sm_list_file_name} \n sam_verbose,${sam_verbose} \n vcf_verbose,${vcf_verbose} \n skip_umi,${skip_umi} \n cap_BQ,${cap_BQ} \n min_BQ,${min_BQ} \n min_MQ,${min_MQ} \n min_TD,${min_TD} \n excl_flag,${excl_flag} \n grouplist,${group_list} \n min_total,${min_total} \n min_uniq,${min_uniq} \n min_umi,${min_umi} \n min_snp,${min_snp} \n init_cluster,${init_cluster} \n nsample,${nsample} \n aux_files,${aux_files} \n verbose,${verbose} \n doublet_prior,${doublet_prior} \n bf_thres,${bf_thres} \n frac_init_clust,${frac_init_clust} \n iter_init,${iter_init} \n keep_init_missing,${keep_init_missing}" >> freemuxlet_${task.index}/params.csv popscle dsc-pileup $samfile ${taggroup} ${tagUMI} $vcffile ${smlist} ${sm_list_file} ${samverbose} \ ${vcfverbose} ${skipumi} ${capBQ} ${minBQ} ${minMQ} ${minTD} ${exclflag} ${grouplist} ${mintotal} ${minuniq} \ @@ -134,7 +139,7 @@ workflow demultiplex_freemuxlet{ keep_init_missing = split_input(params.keep_init_missing) freemuxlet_out = params.freemuxlet_out - freemuxlet(sam, vcf, tag_group, tag_UMI, sm, sm_list, sam_verbose, vcf_verbose, skip_umi, cap_BQ, + freemuxlet(sam, params.barcodes, vcf, tag_group, tag_UMI, sm, sm_list, sam_verbose, vcf_verbose, skip_umi, cap_BQ, min_BQ, min_MQ, min_TD, excl_flag, group_list, min_total, min_uniq, min_umi, min_snp, init_cluster, nsample, aux_files, verbose, doublet_prior, bf_thres, frac_init_clust, iter_init, keep_init_missing, freemuxlet_out) diff --git a/modules/single/gene_demulti/souporcell.nf b/modules/single/gene_demulti/souporcell.nf index 72cd131..2ad7fb9 100755 --- a/modules/single/gene_demulti/souporcell.nf +++ b/modules/single/gene_demulti/souporcell.nf @@ -59,7 +59,7 @@ process souporcell{ mkdir $out touch souporcell_${task.index}/params.csv echo -e "Argument,Value \n bamfile,${bam} \n barcode,${barcodes} \n fasta,${fasta} \n threads,${threads} \n clusters,${clusters} \n ploidy,${ploidy} \n min_alt,${min_alt} \n min_ref,${min_ref} \n max_loci,${max_loci} \n restarts,${restarts} \n common_variant,${commonvariant_name} \n known_genotype,${knowngenotype_name} \n 
known_genotype_sample,${knowngenotype_sample_name} \n skip_remap,${skip_remap} \n ignore,${ignore} " >> souporcell_${task.index}/params.csv - souporcell_pipeline.py $bamfile $barcode $fastafile $thread $cluster $ploi $minalt $minref $maxloci $restart $commonvariant $knowngenotype $knowngenotypes_sample $skipremap $ign -o $out + souporcell_pipeline.py --threads ${task.cpus} $bamfile $barcode $fastafile $thread $cluster $ploi $minalt $minref $maxloci $restart $commonvariant $knowngenotype $knowngenotypes_sample $skipremap $ign -o $out """ } diff --git a/modules/single/gene_demulti/vireo.nf b/modules/single/gene_demulti/vireo.nf index 55c8cbd..fbe383b 100755 --- a/modules/single/gene_demulti/vireo.nf +++ b/modules/single/gene_demulti/vireo.nf @@ -35,7 +35,8 @@ process vireo{ def cell_data = "-c $celldata" def n_donor = ndonor != 'None'? "-N $ndonor" : '' def n_donor_yesno = ndonor != 'None'? "$ndonor" : "Number of donors are not given" - def donor = donorfile != 'None' ? "-d $donorfile" : '' + def donor = donorfile != 'None' ? "-d no_prefix.vcf" : '' + def donor_no_chr_cmd = donorfile != 'None' ? "zcat $donorfile | awk '{gsub(/^chr/,\"\"); print}' | awk '{gsub(/ID=chr/,\"ID=\"); print}' > no_prefix.vcf" : '' def donor_data_name = donorfile != 'None' ? donorfile : 'Donor file is not given' def geno_tag = donorfile != 'None' ? "--genoTag $genoTag" : '' def no_doublet = noDoublet != 'False' ? "--noDoublet" : '' @@ -55,7 +56,10 @@ process vireo{ mkdir vireo_${task.index} mkdir vireo_${task.index}/${vireo_out} touch vireo_${task.index}/params.csv + + ${donor_no_chr_cmd} echo -e "Argument,Value \n cell_data,${celldata} \n n_donor,${n_donor_yesno} \n donor_data,${donor_data_name} \n genoTag,${genoTag} \n noDoublet,${noDoublet} \n nInit,${nInit} \n extraDonor,${extraDonor} \n extraDonorMode,${extraDonorMode} \n learnGT,${learnGT_yesno} \n ASEmode,${ASEmode} \n noPlot,${noPlot} \n randSeed,${randSeed} \n cellRange,${cellRange} \n callAmbientRNAs,${callAmbientRNAs} \n nproc,${nproc}" >> vireo_${task.index}/params.csv + vireo ${cell_data} ${n_donor} $donor ${geno_tag} ${no_doublet} ${n_init} ${extra_donor} ${extradonor_mode} \ $learnGT ${ase_mode} ${no_plot} ${random_seed} ${cell_range} ${call_ambient_rna} ${n_proc} \ -o vireo_${task.index}/${vireo_out} diff --git a/modules/single/hash_demultiplexing.nf b/modules/single/hash_demultiplexing.nf index fc3592a..e96f6f1 100644 --- a/modules/single/hash_demultiplexing.nf +++ b/modules/single/hash_demultiplexing.nf @@ -1,5 +1,6 @@ #!/usr/bin/env nextflow nextflow.enable.dsl=2 + include { preprocessing_hashing as preprocessing_hashing_htodemux } from './hash_demulti/preprocess' include { preprocessing_hashing as preprocessing_hashing_multiseq } from './hash_demulti/preprocess' include { multiseq_hashing } from './hash_demulti/multiseq' diff --git a/modules/single_demultiplexing.nf b/modules/single_demultiplexing.nf new file mode 100644 index 0000000..16959e5 --- /dev/null +++ b/modules/single_demultiplexing.nf @@ -0,0 +1,118 @@ +#!/usr/bin/env nextflow +nextflow.enable.dsl=2 + +include { gene_demultiplexing } from "$projectDir/modules/single/gene_demultiplexing" +include { hash_demultiplexing } from "$projectDir/modules/single/hash_demultiplexing" +include { donor_match } from "$projectDir/modules/single/donor_match" + + +process generate_data{ + publishDir "$projectDir/$params.outdir/$params.mode/data_output", mode: 'copy' + + conda "pandas scanpy mudata" + + input: + path assignment + val generate_anndata + val generate_mudata + val rna_matrix + val 
hto_matrix + output: + path "adata_with_donor_matching.h5ad", optional: true + path "mudata_with_donor_matching.h5mu", optional: true + + + script: + def generate_adata = "" + def generate_mdata = "" + + if (generate_anndata == "True"){ + if(rna_matrix == "None"){ + error "Error: RNA count matrix is not given." + } + generate_adata = "--generate_anndata --read_rna_mtx $rna_matrix" + } + if (generate_mudata == "True"){ + if(rna_matrix == "None"){ + error "Error: RNA count matrix is not given." + } + if(hto_matrix == "None"){ + error "Error: HTO count matrix is not given." + } + generate_mdata = "--generate_mudata --read_rna_mtx $rna_matrix --read_hto_mtx $hto_matrix" + } + + """ + generate_data.py --assignment $assignment $generate_adata $generate_mdata + """ +} + +process summary_all{ + publishDir "$projectDir/$params.outdir/$params.mode", mode: 'copy' + label 'small_mem' + + conda "pandas scanpy mudata" + + input: + path gene_demulti_result + path hash_demulti_result + output: + path "summary" + + script: + """ + summary.py --gene_demulti $gene_demulti_result --hash_demulti $hash_demulti_result + """ +} + + + + +workflow run_single{ + + print("-----Running single sample-----") + + if (params.mode == "genetic"){ + + // Performing genetic demultiplexing methodologies + gene_demultiplexing() + if (params.match_donor == "True"){ + donor_match(gene_demultiplexing.out) + } + } + else if (params.mode == "hashing"){ + + // Performing hashing demultplexing + hash_demultiplexing(params.rna_matrix_raw, params.rna_matrix_filtered, params.hto_matrix_raw, params.hto_matrix_filtered) + if (params.match_donor == "True"){ + donor_match(hash_demultiplexing.out) + } + } + else if (params.mode == "rescue"){ + + // Performing both hashing and genetic demultiplexing methods + hash_demultiplexing(params.rna_matrix_raw, params.rna_matrix_filtered, params.hto_matrix_raw, params.hto_matrix_filtered) + gene_demultiplexing() + gene_summary = gene_demultiplexing.out + hash_summary = hash_demultiplexing.out + summary_all(gene_summary, hash_summary) + + if (params.match_donor == "True"){ + donor_match(summary_all.out) + if (params.generate_anndata == "True" || params.generate_mudata == "True" ){ + generate_data(donor_match.out, params.generate_anndata, params.generate_mudata, + params.rna_matrix_filtered, params.hto_matrix_filtered) + } + } + } + else if (params.mode == "donor_match"){ + + // Performing just donor matching + donor_match(params.demultiplexing_result) + if (params.generate_anndata == "True" || params.generate_mudata == "True" ){ + generate_data(donor_match.out, params.generate_anndata, params.generate_mudata, + params.rna_matrix_filtered, params.hto_matrix_filtered) + } + + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index ed91d0a..50d2728 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,5 +1,11 @@ params { + + // Nf core integration + custom_config_version = 'master' + custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" + + outdir = "result" mode = "rescue" generate_anndata = "True" @@ -372,11 +378,11 @@ profiles { executor = 'local' withLabel: big_mem { cpus = 4 - memory = 10.GB + memory = 150.GB } withLabel: small_mem { cpus = 2 - memory = 8.GB + memory = 100.GB } } @@ -389,11 +395,13 @@ profiles { // clusterOptions = ... 
withLabel: big_mem { cpus = 32 - memory = 64.GB + memory = 150.GB + time =24.h } withLabel: small_mem { cpus = 16 - memory = 32.GB + memory = 100.GB + time =24.h } } } @@ -451,3 +459,10 @@ process { echo = true debug = true } + + +try { + includeConfig "${params.custom_config_base}/nfcore_custom.config" +} catch (Exception e) { + System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") +} \ No newline at end of file From 32c6900e709801ba7b7e38e181c15ef907880c65 Mon Sep 17 00:00:00 2001 From: maxozo Date: Thu, 25 Jan 2024 22:50:20 +0000 Subject: [PATCH 02/16] adding subset to speed up runs --- modules/multi/gene_demulti/demuxlet.nf | 27 +++++- modules/multi/gene_demulti/freemuxlet.nf | 28 +++++- modules/multi/gene_demulti/souporcell.nf | 2 +- modules/multi/gene_demultiplexing.nf | 111 +++++++++++------------ modules/single/gene_demultiplexing.nf | 4 + nextflow.config | 1 + 6 files changed, 112 insertions(+), 61 deletions(-) diff --git a/modules/multi/gene_demulti/demuxlet.nf b/modules/multi/gene_demulti/demuxlet.nf index 7faecca..4c8b61a 100755 --- a/modules/multi/gene_demulti/demuxlet.nf +++ b/modules/multi/gene_demulti/demuxlet.nf @@ -2,6 +2,25 @@ nextflow.enable.dsl=2 + +process subset_bam_and_sort_vcf_based_on_reference{ + label 'small_mem' + conda "bioconda::samtools=1.19.2 bedtools bcftools=1.19" + + input: + tuple val(sampleId), path(sam), path(sam_index), path(barcodes), val(vcf) + + output: + tuple val(sampleId), path("${sampleId}_dmx__filtered_bam_file.bam"), path("${sampleId}_dmx__filtered_bam_file.bam.csi"), path(barcodes), path("${sampleId}_dmx__samples.sorted_as_in_bam.vcf"), emit: input + when: + vcf !='None' + script: + """ + filter_bam_file_for_popscle_dsc_pileup.sh ${sam} ${barcodes} ${vcf} ${sampleId}_dmx__filtered_bam_file.bam + sort_vcf_same_as_bam.sh ${sampleId}_dmx__filtered_bam_file.bam ${vcf} > ${sampleId}_dmx__samples.sorted_as_in_bam.vcf + """ +} + process demuxlet { publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/demuxlet", mode: 'copy' label 'small_mem' @@ -9,7 +28,7 @@ process demuxlet { conda "bioconda::popscle" input: - tuple val(sampleId), path(sam), path(sam_index), path(group_list), path(vcf_donor) + tuple val(sampleId), path(sam), path(sam_index), path(group_list), val(vcf_donor) val tag_group val tag_UMI val sm @@ -45,6 +64,9 @@ process demuxlet { output: path "demuxlet_${sampleId}" + when: + vcf_donor !='None' + script: def samfile = "--sam $sam" def taggroup = tag_group != 'None' ? 
"--tag-group ${tag_group}" : '' @@ -136,6 +158,9 @@ workflow demultiplex_demuxlet{ doublet_prior = params.doublet_prior demuxlet_out = params.demuxlet_out + + subset_bam_and_sort_vcf_based_on_reference(input_list) + input_list = subset_bam_and_sort_vcf_based_on_reference.out.input demuxlet(input_list, tag_group, tag_UMI, sm, sm_list, sam_verbose, vcf_verbose, skip_umi, cap_BQ, min_BQ, min_MQ, min_TD, excl_flag, min_total, min_uniq, min_umi, min_snp, plp, field, geno_error_offset, geno_error_coeff, r2_info, min_mac, min_callrate, alpha, doublet_prior, demuxlet_out) diff --git a/modules/multi/gene_demulti/freemuxlet.nf b/modules/multi/gene_demulti/freemuxlet.nf index 23390ff..1f5e9f6 100755 --- a/modules/multi/gene_demulti/freemuxlet.nf +++ b/modules/multi/gene_demulti/freemuxlet.nf @@ -1,6 +1,25 @@ #!/usr/bin/env nextflow nextflow.enable.dsl=2 + +process subset_bam_and_sort_vcf_based_on_reference{ + label 'small_mem' + conda "bioconda::samtools=1.19.2 bedtools bcftools=1.19" + + input: + tuple val(sampleId), path(sam), path(sam_index), path(group_list), val(nsample) + path vcf + + output: + tuple val(sampleId), path(sam), path(sam_index), path(group_list), val(nsample), path("${sampleId}__samples.sorted_as_in_bam.vcf"), emit: input + + script: + """ + + sort_vcf_same_as_bam.sh ${sam} ${vcf} > ${sampleId}__samples.sorted_as_in_bam.vcf + """ +} + process freemuxlet { publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/freemuxlet", mode: 'copy' label 'small_mem' @@ -8,8 +27,7 @@ process freemuxlet { conda "bioconda::popscle" input: - tuple val(sampleId), path(sam), path(sam_index), path(group_list), val(nsample) - path vcf + tuple val(sampleId), path(sam), path(sam_index), path(group_list), val(nsample), path(vcf) val tag_group val tag_UMI val sm @@ -119,7 +137,11 @@ workflow demultiplex_freemuxlet{ keep_init_missing = params.keep_init_missing freemuxlet_out = params.freemuxlet_out - freemuxlet(input_list, vcf, tag_group, tag_UMI, sm, sm_list, sam_verbose, vcf_verbose, skip_umi, cap_BQ, min_BQ, min_MQ, + + + subset_bam_and_sort_vcf_based_on_reference(input_list,vcf) + input_list = subset_bam_and_sort_vcf_based_on_reference.out.input + freemuxlet(input_list, tag_group, tag_UMI, sm, sm_list, sam_verbose, vcf_verbose, skip_umi, cap_BQ, min_BQ, min_MQ, min_TD, excl_flag, min_total, min_uniq, min_umi, min_snp, init_cluster,aux_files, verbose, doublet_prior, bf_thres, frac_init_clust, iter_init, keep_init_missing, freemuxlet_out) diff --git a/modules/multi/gene_demulti/souporcell.nf b/modules/multi/gene_demulti/souporcell.nf index a6f643d..9d6ad0f 100755 --- a/modules/multi/gene_demulti/souporcell.nf +++ b/modules/multi/gene_demulti/souporcell.nf @@ -56,7 +56,7 @@ process souporcell{ touch souporcell_${sampleId}/params.csv echo -e "Argument,Value \n bamfile,${bam} \n barcode,${barcodes} \n fasta,${fasta} \n threads,${threads} \n clusters,${clusters} \n ploidy,${ploidy} \n min_alt,${min_alt} \n min_ref,${min_ref} \n max_loci,${max_loci} \n restarts,${restarts} \n common_variant,${commonvariant_name} \n known_genotype,${knowngenotype_name} \n known_genotype_sample,${knowngenotype_sample_name} \n skip_remap,${skip_remap} \n ignore,${ignore} " >> souporcell_${sampleId}/params.csv - souporcell_pipeline.py $bamfile $barcode $fastafile $thread $cluster $ploi $minalt $minref $maxloci $restart \ + souporcell_pipeline.py --threads ${task.cpus} $bamfile $barcode $fastafile $thread $cluster $ploi $minalt $minref $maxloci $restart \ $commonvariant $knowngenotype $knowngenotypes_sample 
$skipremap $ign -o $out """ } diff --git a/modules/multi/gene_demultiplexing.nf b/modules/multi/gene_demultiplexing.nf index 80c0ab9..eab527b 100644 --- a/modules/multi/gene_demultiplexing.nf +++ b/modules/multi/gene_demultiplexing.nf @@ -20,6 +20,28 @@ def split_input(input){ } } + +process subset_bam_to_comon_variants{ + + label 'small_mem' + conda "bioconda::samtools=1.19.2 bedtools bcftools=1.19" + + input: + tuple val(sampleId), path(sam), path(sam_index), path(barcodes) + path vcf + + output: + tuple val(sampleId), path("${sampleId}__filtered_bam_file.bam"), path("${sampleId}__filtered_bam_file.bam.csi"), emit: input + + script: + """ + + bcftools sort ${vcf} -Oz -o sorted.vcf.gz + filter_bam_file_for_popscle_dsc_pileup.sh ${sam} ${barcodes} sorted.vcf.gz ${sampleId}__filtered_bam_file.bam + """ + +} + process summary{ publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti", mode: 'copy' label 'small_mem' @@ -103,8 +125,22 @@ workflow gene_demultiplexing { | map { row-> tuple(row.sampleId, row.bam)} | data_preprocess qc_bam = data_preprocess.out.map{ it -> tuple( it.name.tokenize( '_' ).last(), it + "/sorted.bam", it + "/sorted.bam.bai") } + }else{ + qc_bam = Channel.fromPath(params.multi_input) \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, row.bam, row.bam_index)} + + + } + input_param_cellsnp = Channel.fromPath(params.multi_input) \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, row.barcodes) } + qc_bam_new = qc_bam.join(input_param_cellsnp) + + qc_bam = subset_bam_to_comon_variants(qc_bam_new,params.common_variants_freemuxlet) + ////////// //FreeBayes/ scSplit @@ -115,29 +151,16 @@ workflow gene_demultiplexing { if (params.region != "None"){ freebayes_region = split_input(params.region) } - if(params.scSplit_preprocess == "True"){ - variant_freebayes(qc_bam, freebayes_region) - } - else{ - input_list = Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.bam, row.bam_index)} - variant_freebayes(input_list, freebayes_region) - } + + variant_freebayes(qc_bam, freebayes_region) filter_variant(variant_freebayes.out) freebayes_vcf = filter_variant.out.map{ it -> tuple(it[0], it[1] + "/filtered_sorted_total_chroms.vcf")} } if (params.scSplit == "True"){ - if (params.scSplit_preprocess == 'False'){ - input_bam_scsplit = Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.bam, row.bam_index)} - } - else{ - input_bam_scsplit = qc_bam - } + + input_bam_scsplit = qc_bam if (params.scSplit_variant == 'True'){ input_vcf_scsplit = freebayes_vcf @@ -148,7 +171,7 @@ workflow gene_demultiplexing { | splitCsv(header:true) \ | map { row-> tuple(row.sampleId, row.vcf_mixed)} } - // here if there are genotypes provided we need to ensure bam is sorted corectly and is subsampled on the regions needed. 
+ input_param_scsplit = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ | map { row-> tuple(row.sampleId, row.barcodes, row.nsample, row.vcf_donor)} @@ -168,19 +191,7 @@ workflow gene_demultiplexing { ////////// if (params.vireo == "True" & params.vireo_variant == 'True'){ - if(params.vireo_preprocess == 'True'){ - input_param_cellsnp = Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.barcodes) } - qc_bam_new = qc_bam.join(input_param_cellsnp) - variant_cellSNP(qc_bam_new) - } - else{ - Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.bam, row.bam_index, row.barcodes)} - | variant_cellSNP - } + variant_cellSNP(qc_bam_new) cellsnp_vcf = variant_cellSNP.out.map{ it -> tuple( it.name.tokenize( '_' ).last(), it + "/*/cellSNP.cells.vcf") } } @@ -209,18 +220,14 @@ workflow gene_demultiplexing { ////////// - //Demuxlet + // Demuxlet/Freemuxlet + // demuxlet (with genotypes) or freemuxlet (without genotypes) ////////// if (params.demuxlet == "True"){ - if (params.demuxlet_preprocess == 'False'){ - input_bam_demuxlet = Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.bam, row.bam_index)} - } - else{ - input_bam_demuxlet = qc_bam - } + + input_bam_demuxlet = qc_bam + input_param_demuxlet = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ | map { row-> tuple(row.sampleId, row.barcodes, row.vcf_donor)} @@ -238,18 +245,15 @@ workflow gene_demultiplexing { ////////// if (params.freemuxlet == "True"){ - if (params.freemuxlet_preprocess == 'False'){ - input_bam_freemuxlet = Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.bam, row.bam_index)} - } - else{ - input_bam_freemuxlet = qc_bam - } + + input_bam_freemuxlet = qc_bam + input_param_freemuxlet = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ | map { row-> tuple(row.sampleId, row.barcodes, row.nsample)} + input_list_freemuxlet = input_bam_freemuxlet.join(input_param_freemuxlet) + demultiplex_freemuxlet(input_list_freemuxlet) freemuxlet_out = demultiplex_freemuxlet.out } @@ -263,14 +267,9 @@ workflow gene_demultiplexing { ////////// if (params.souporcell == "True"){ - if (params.souporcell_preprocess == 'False'){ - input_bam_souporcell = Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.bam, row.bam_index)} - } - else{ - input_bam_souporcell = qc_bam - } + + input_bam_souporcell = qc_bam + input_param_souporcell = Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ | map { row-> tuple(row.sampleId, row.barcodes, row.nsample, row.vcf_donor)} diff --git a/modules/single/gene_demultiplexing.nf b/modules/single/gene_demultiplexing.nf index 20d5bd4..bb6b921 100644 --- a/modules/single/gene_demultiplexing.nf +++ b/modules/single/gene_demultiplexing.nf @@ -129,6 +129,7 @@ workflow gene_demultiplexing { } if (params.demuxlet == "True"){ + // This will be only run if the genotype provided is not None bam = params.demuxlet_preprocess == 'True'? qc_bam: input_bam //qc_bam.mix(input_bam)) demultiplex_demuxlet(bam) demuxlet_out = demultiplex_demuxlet.out @@ -139,7 +140,10 @@ workflow gene_demultiplexing { if (params.freemuxlet == "True"){ + // This will be run regardless if the genotype is provided to pipeline or not since freemuxlet is a genotype absent mode. + bam = params.freemuxlet_preprocess == 'True'? 
qc_bam: input_bam // qc_bam.mix(input_bam)) + demultiplex_freemuxlet(bam) freemuxlet_out = demultiplex_freemuxlet.out } diff --git a/nextflow.config b/nextflow.config index 50d2728..973fab8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -338,6 +338,7 @@ params { genotype_qualities= "False" debug = "False" dd = "False" + // cellsnp targetsVCF = "None" sampleList = "None" From ddc2e8b45e44610e2c89fe9f06c53fd31457b2c3 Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Wed, 31 Jan 2024 20:28:34 +0000 Subject: [PATCH 03/16] mods --- main.nf | 21 +++++------ modules/multi/donor_match.nf | 2 +- modules/multi/gene_demulti/bcftools.nf | 2 +- modules/multi/gene_demulti/cellsnp.nf | 8 ++-- modules/multi/gene_demulti/demuxlet.nf | 4 +- modules/multi/gene_demulti/freebayes.nf | 6 +-- modules/multi/gene_demulti/freemuxlet.nf | 4 +- modules/multi/gene_demulti/samtools.nf | 2 +- modules/multi/gene_demulti/scsplit.nf | 2 +- modules/multi/gene_demulti/souporcell.nf | 10 +++-- modules/multi/gene_demulti/vireo.nf | 6 ++- modules/multi/gene_demultiplexing.nf | 8 ++-- nextflow.config | 14 ++++++- subworkflows/HADGE.nf | 48 ++++++++++++++++++++++++ 14 files changed, 102 insertions(+), 35 deletions(-) create mode 100644 subworkflows/HADGE.nf diff --git a/main.nf b/main.nf index 4bdd4f6..19ce48d 100644 --- a/main.nf +++ b/main.nf @@ -1,17 +1,16 @@ #!/usr/bin/env nextflow nextflow.enable.dsl=2 -include { run_multi } from "$projectDir/modules/multi_demultiplexing" -include {run_single} from "$projectDir/modules/single_demultiplexing" +include { summary } from "$projectDir/modules/multi/gene_demultiplexing" +include { donor_match } from "$projectDir/modules/multi/donor_match" +include { HADGE; SUMMARY } from "$projectDir/subworkflows/HADGE" +// Main entry point in the pipeline workflow { - // Here we decide if it is a single sample demultiplexing or multi input demutliplexing run. 
- if (params.multi_input == null){ - // Single Mode - run_single() - } - else{ - // Multi mode - run_multi() - } + HADGE() +} + +// Entry point to generate only the summary files +workflow SUMMARY_ONLY{ + SUMMARY() } diff --git a/modules/multi/donor_match.nf b/modules/multi/donor_match.nf index 070e2ef..898d70c 100644 --- a/modules/multi/donor_match.nf +++ b/modules/multi/donor_match.nf @@ -4,7 +4,7 @@ nextflow.enable.dsl=2 process matchDonor{ publishDir "$projectDir/$params.outdir/$sampleId/$params.mode", mode: 'copy' label 'big_mem' - + tag "${sampleId}" conda "$projectDir/conda/donor_match.yml" input: diff --git a/modules/multi/gene_demulti/bcftools.nf b/modules/multi/gene_demulti/bcftools.nf index 5de6b67..1a774e6 100644 --- a/modules/multi/gene_demulti/bcftools.nf +++ b/modules/multi/gene_demulti/bcftools.nf @@ -3,7 +3,7 @@ nextflow.enable.dsl=2 process bcftools{ publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/bcftools", mode: 'copy' label 'big_mem' - + tag "${sampleId}" conda "bioconda::bcftools=1.9" input: diff --git a/modules/multi/gene_demulti/cellsnp.nf b/modules/multi/gene_demulti/cellsnp.nf index fe052c5..2fb276c 100644 --- a/modules/multi/gene_demulti/cellsnp.nf +++ b/modules/multi/gene_demulti/cellsnp.nf @@ -4,7 +4,7 @@ nextflow.enable.dsl=2 process cellSNP{ publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/cellSNP", mode: 'copy' label 'big_mem' - + tag "${sampleId}" conda "bioconda::cellsnp-lite" input: @@ -33,7 +33,8 @@ process cellSNP{ output: - path "cellsnp_${sampleId}" + path "cellsnp_${sampleId}", emit: out1 + tuple val(sampleId), path("cellsnp_${sampleId}/${cellsnp_out}"), emit: cellsnp_input script: def samFile = "--samFile ${samFile_cellSNP}" @@ -114,5 +115,6 @@ workflow variant_cellSNP{ nproc_cellSNP, refseq_cellSNP, chrom, cellTAG, UMItag, minCOUNT, minMAF, doubletGL, inclFLAG, exclFLAG, minLEN, minMAPQ, countORPHAN, cellsnp_out) emit: - cellSNP.out + out1 = cellSNP.out.out1 + cellsnp_input = cellSNP.out.cellsnp_input } diff --git a/modules/multi/gene_demulti/demuxlet.nf b/modules/multi/gene_demulti/demuxlet.nf index 4c8b61a..44ae4b9 100755 --- a/modules/multi/gene_demulti/demuxlet.nf +++ b/modules/multi/gene_demulti/demuxlet.nf @@ -6,7 +6,7 @@ nextflow.enable.dsl=2 process subset_bam_and_sort_vcf_based_on_reference{ label 'small_mem' conda "bioconda::samtools=1.19.2 bedtools bcftools=1.19" - + tag "${sampleId}" input: tuple val(sampleId), path(sam), path(sam_index), path(barcodes), val(vcf) @@ -24,7 +24,7 @@ process demuxlet { publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/demuxlet", mode: 'copy' label 'small_mem' - + tag "${sampleId}" conda "bioconda::popscle" input: diff --git a/modules/multi/gene_demulti/freebayes.nf b/modules/multi/gene_demulti/freebayes.nf index fc85d92..1117406 100644 --- a/modules/multi/gene_demulti/freebayes.nf +++ b/modules/multi/gene_demulti/freebayes.nf @@ -4,7 +4,7 @@ nextflow.enable.dsl=2 process freebayes{ publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/freebayes", mode: 'copy' label 'big_mem' - + tag "${sampleId}" conda "bioconda::freebayes=1.2" input: @@ -85,7 +85,7 @@ process freebayes{ output: - tuple val(sampleId), path ("${sampleId}_${region_freebayes}_${vcf_freebayes}") + tuple val(sampleId), path ("out_${sampleId}_${region_freebayes}_${vcf_freebayes}") script: def stdin = stdin_freebayes != 'False' ?
"--stdin" : '' @@ -179,7 +179,7 @@ process freebayes{ ${observation_bias} ${base_quality_cap} ${prob_contamination} ${legacy_gls} ${contamination_estimates} \ ${report_genotype_likelihood_max} ${genotyping_max_iterations} ${genotyping_max_banddepth} ${posterior_integration_limits} ${exclude_unobserved_genotypes} ${genotype_variant_threshold} \ ${use_mapping_quality} ${harmonic_indel_quality} ${read_dependence_factor} ${genotype_qualities} $debug $dd - + ln -s ${sampleId}_${region_freebayes}_${vcf_freebayes} out_${sampleId}_${region_freebayes}_${vcf_freebayes} """ } diff --git a/modules/multi/gene_demulti/freemuxlet.nf b/modules/multi/gene_demulti/freemuxlet.nf index 1f5e9f6..58927cc 100755 --- a/modules/multi/gene_demulti/freemuxlet.nf +++ b/modules/multi/gene_demulti/freemuxlet.nf @@ -5,7 +5,7 @@ nextflow.enable.dsl=2 process subset_bam_and_sort_vcf_based_on_reference{ label 'small_mem' conda "bioconda::samtools=1.19.2 bedtools bcftools=1.19" - + tag "${sampleId}" input: tuple val(sampleId), path(sam), path(sam_index), path(group_list), val(nsample) path vcf @@ -23,7 +23,7 @@ process subset_bam_and_sort_vcf_based_on_reference{ process freemuxlet { publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/freemuxlet", mode: 'copy' label 'small_mem' - + tag "${sampleId}" conda "bioconda::popscle" input: diff --git a/modules/multi/gene_demulti/samtools.nf b/modules/multi/gene_demulti/samtools.nf index dcf322b..10d3558 100644 --- a/modules/multi/gene_demulti/samtools.nf +++ b/modules/multi/gene_demulti/samtools.nf @@ -4,7 +4,7 @@ nextflow.enable.dsl=2 process samtools{ publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/samtools", mode: 'copy' label 'big_mem' - + tag "${sampleId}" conda "bioconda::samtools bioconda::umi_tools" input: diff --git a/modules/multi/gene_demulti/scsplit.nf b/modules/multi/gene_demulti/scsplit.nf index b496a44..7aceeb7 100644 --- a/modules/multi/gene_demulti/scsplit.nf +++ b/modules/multi/gene_demulti/scsplit.nf @@ -4,7 +4,7 @@ nextflow.enable.dsl=2 process scSplit{ publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/scSplit", mode: 'copy' label 'big_mem' - + tag "${sampleId}" conda "$projectDir/conda/scsplit.yml" input: diff --git a/modules/multi/gene_demulti/souporcell.nf b/modules/multi/gene_demulti/souporcell.nf index 9d6ad0f..102487f 100755 --- a/modules/multi/gene_demulti/souporcell.nf +++ b/modules/multi/gene_demulti/souporcell.nf @@ -4,7 +4,7 @@ nextflow.enable.dsl=2 process souporcell{ publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/souporcell", mode: 'copy' label 'big_mem' - + tag "${sampleId}" container "shub://wheaton5/souporcell" input: @@ -37,10 +37,12 @@ process souporcell{ def minref = "--min_ref ${min_ref}" def maxloci = "--max_loci ${max_loci}" def restart = restarts != 'None' ? "--restarts $restarts" : '' - def commonvariant = (common_variants != 'None' & use_known_genotype != "True" & known_genotypes == 'None' )? "--common_variants ${common_variants}" : '' + def commonvariant = (common_variants != 'None' & use_known_genotype != "True" & known_genotypes == 'None' )? "--common_variants commonvariant.vcf" : '' + def commonvariant_unzip = (common_variants != 'None' & use_known_genotype != "True" & known_genotypes == 'None' )? "bcftools view ${common_variants} -Ov -o commonvariant.vcf" : '' def commonvariant_name = (common_variants != 'None' & use_known_genotype != "True" & known_genotypes == 'None' ) ? common_variants : 'No common variants are given.' 
- def knowngenotype = (known_genotypes != 'None' & use_known_genotype == "True") ? "--known_genotypes ${known_genotypes}" : '' + def genotype_unzip = (known_genotypes != 'None' & use_known_genotype == "True") ? "bcftools view ${known_genotypes} -Ov -o unzipped.vcf" : '' + def knowngenotype = (known_genotypes != 'None' & use_known_genotype == "True") ? "--known_genotypes unzipped.vcf" : '' def knowngenotype_name = (known_genotypes != 'None' & use_known_genotype == "True") ? known_genotypes : 'No known variants are given.' def knowngenotypes_sample = known_genotypes_sample_names != 'None' ? "--known_genotypes_sample_names ${known_genotypes_sample_names}" : '' @@ -51,6 +53,8 @@ process souporcell{ def out = "souporcell_${sampleId}/${souporcell_out}" """ + ${genotype_unzip} + ${commonvariant_unzip} mkdir souporcell_${sampleId} mkdir $out touch souporcell_${sampleId}/params.csv diff --git a/modules/multi/gene_demulti/vireo.nf b/modules/multi/gene_demulti/vireo.nf index 294b132..3cb43c1 100755 --- a/modules/multi/gene_demulti/vireo.nf +++ b/modules/multi/gene_demulti/vireo.nf @@ -4,7 +4,7 @@ nextflow.enable.dsl=2 process vireo{ publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/vireo", mode: 'copy' label 'big_mem' - + tag "${sampleId}" conda "aksarkar::vireosnp" input: @@ -34,7 +34,8 @@ process vireo{ def celldata_name = celldata.baseName def n_donor = ndonor != 'None'? "-N $ndonor" : '' def n_donor_yesno = ndonor != 'None'? "$ndonor" : "Number of donors are not given" - def donor = donorfile != 'None' ? "-d $donorfile" : '' + def donor = donorfile != 'None' ? "-d no_prefix_chr.vcf" : '' + def donor_preprocess = donorfile != 'None' ? "bcftools view $donorfile | awk '{gsub(/^chr/,\"\"); print}' | awk '{gsub(/ID=chr/,\"ID=\"); print}' > no_prefix_chr.vcf" : '' def donor_data_name = donorfile != 'None' ? donorfile : 'Donor file is not given' def geno_tag = donorfile != 'None' ? "--genoTag $genoTag" : '' def no_doublet = noDoublet != 'False' ? 
"--noDoublet" : '' @@ -51,6 +52,7 @@ process vireo{ def n_proc = "--nproc $nproc" """ + ${donor_preprocess} mkdir vireo_${sampleId} mkdir vireo_${sampleId}/${vireo_out} touch vireo_${sampleId}/params.csv diff --git a/modules/multi/gene_demultiplexing.nf b/modules/multi/gene_demultiplexing.nf index eab527b..dad68fa 100644 --- a/modules/multi/gene_demultiplexing.nf +++ b/modules/multi/gene_demultiplexing.nf @@ -25,7 +25,7 @@ process subset_bam_to_comon_variants{ label 'small_mem' conda "bioconda::samtools=1.19.2 bedtools bcftools=1.19" - + tag "${sampleId}" input: tuple val(sampleId), path(sam), path(sam_index), path(barcodes) path vcf @@ -45,7 +45,7 @@ process subset_bam_to_comon_variants{ process summary{ publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti", mode: 'copy' label 'small_mem' - + tag "${sampleId}" conda "pandas scanpy mudata" input: @@ -192,14 +192,14 @@ workflow gene_demultiplexing { if (params.vireo == "True" & params.vireo_variant == 'True'){ variant_cellSNP(qc_bam_new) - cellsnp_vcf = variant_cellSNP.out.map{ it -> tuple( it.name.tokenize( '_' ).last(), it + "/*/cellSNP.cells.vcf") } + cellsnp_vcf = variant_cellSNP.out.out1.map{ it -> tuple( it.name.tokenize( '_' ).last(), it + "/*/cellSNP.cells.vcf") } } if (params.vireo == "True"){ if (params.vireo_variant == 'True'){ - input_vcf_vireo = cellsnp_vcf + input_vcf_vireo = variant_cellSNP.out.cellsnp_input } else{ input_vcf_vireo = Channel.fromPath(params.multi_input) \ diff --git a/nextflow.config b/nextflow.config index 973fab8..ad65771 100644 --- a/nextflow.config +++ b/nextflow.config @@ -6,7 +6,7 @@ params { custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" - outdir = "result" + outdir = "${launchDir}/results" mode = "rescue" generate_anndata = "True" generate_mudata = "False" @@ -404,6 +404,15 @@ profiles { memory = 100.GB time =24.h } + + withName: summary{ + cpus = 6 + memory = 20.GB + } + + withName:souporcell{ + time = 48.h + } } } conda { @@ -459,6 +468,9 @@ profiles { process { echo = true debug = true + maxRetries = 3 + maxErrors = '-1' + errorStrategy = 'retry' } diff --git a/subworkflows/HADGE.nf b/subworkflows/HADGE.nf new file mode 100644 index 0000000..1f32b94 --- /dev/null +++ b/subworkflows/HADGE.nf @@ -0,0 +1,48 @@ +nextflow.enable.dsl=2 + +include { run_multi } from "$projectDir/modules/multi_demultiplexing" +include {run_single} from "$projectDir/modules/single_demultiplexing" +include { summary } from "$projectDir/modules/multi/gene_demultiplexing" +include { donor_match } from "$projectDir/modules/multi/donor_match" + +workflow HADGE { + // Here we decide if it is a single sample demultiplexing or multi input demutliplexing run. 
+ if (params.multi_input == null){ + // Single Mode + run_single() + } + else{ + // Multi mode + run_multi() + } +} + + + +workflow SUMMARY{ + + Channel.fromPath(params.multi_input) \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, file(row.hto_matrix_filtered), file(row.rna_matrix_filtered))} + | set {input_list_summary} + log.info('running summary only') + + // demuxlet_out = Channel.fromPath('/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/hedge_multi/hadge_modified/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/full_run/gt_absent/result/*/genetic/gene_demulti/demuxlet/demuxlet*', type: 'dir').collect() + demuxlet_out = channel.value("no_result") + freemuxlet_out= Channel.fromPath('/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/hedge_multi/hadge_modified/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/full_run/gt_absent/result/*/genetic/gene_demulti/freemuxlet/freemuxlet_*', type: 'dir').collect() + vireo_out= Channel.fromPath('/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/hedge_multi/hadge_modified/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/full_run/gt_absent/result/*/genetic/gene_demulti/vireo/vireo_*', type: 'dir').collect() + scSplit_out= Channel.fromPath('/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/hedge_multi/hadge_modified/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/full_run/gt_absent/result/*/genetic/gene_demulti/scSplit/scsplit*', type: 'dir').collect() + souporcell_out= Channel.fromPath('/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/hedge_multi/hadge_modified/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/full_run/gt_absent/result/*/genetic/gene_demulti/souporcell/souporcell_*', type: 'dir').collect() + // /lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/hedge_multi/hadge_modified/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/full_run/gt_absent/result/pool1/genetic/gene_demulti/souporcell/souporcell_pool1 + // souporcell_out = channel.value("no_result") + summary(input_list_summary, demuxlet_out, freemuxlet_out, vireo_out, souporcell_out, scSplit_out, + params.generate_anndata, params.generate_mudata) + + + Channel.fromPath(params.multi_input) \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, row.nsample, row.barcodes, "None", "None")} + | join(summary.out) + | donor_match + +} \ No newline at end of file From 883826f6bfc085b71fe5313e8d56d9253d2cfc2d Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Thu, 1 Feb 2024 13:15:52 +0000 Subject: [PATCH 04/16] fixing channel conflicts --- main.nf | 1 - modules/multi/donor_match.nf | 8 ++-- modules/multi/gene_demulti/bcftools.nf | 2 +- modules/multi/gene_demulti/cellsnp.nf | 2 +- modules/multi/gene_demulti/demuxlet.nf | 2 +- modules/multi/gene_demulti/freebayes.nf | 2 +- modules/multi/gene_demulti/freemuxlet.nf | 2 +- modules/multi/gene_demulti/samtools.nf | 2 +- modules/multi/gene_demulti/scsplit.nf | 2 +- modules/multi/gene_demulti/souporcell.nf | 2 +- modules/multi/gene_demulti/vireo.nf | 2 +- modules/multi/gene_demultiplexing.nf | 49 ++++++++++++----------- modules/multi/hash_demulti/bff.nf | 2 +- modules/multi/hash_demulti/demuxem.nf | 2 +- modules/multi/hash_demulti/gmm_demux.nf | 2 +- modules/multi/hash_demulti/hashedDrops.nf | 2 +- modules/multi/hash_demulti/hashsolo.nf | 2 +- modules/multi/hash_demulti/htodemux.nf | 2 +- modules/multi/hash_demulti/multiseq.nf | 2 +- modules/multi/hash_demulti/preprocess.nf | 2 +- modules/multi/hash_demulti/solo.nf | 2 +- modules/multi/hash_demultiplexing.nf | 2 +- modules/multi_demultiplexing.nf | 4 
+- modules/single_demultiplexing.nf | 4 +- nextflow.config | 9 ++++- subworkflows/HADGE.nf | 26 +++++++----- 26 files changed, 75 insertions(+), 64 deletions(-) diff --git a/main.nf b/main.nf index 19ce48d..712750b 100644 --- a/main.nf +++ b/main.nf @@ -2,7 +2,6 @@ nextflow.enable.dsl=2 include { summary } from "$projectDir/modules/multi/gene_demultiplexing" -include { donor_match } from "$projectDir/modules/multi/donor_match" include { HADGE; SUMMARY } from "$projectDir/subworkflows/HADGE" // Main entry point in the pipeline diff --git a/modules/multi/donor_match.nf b/modules/multi/donor_match.nf index 898d70c..09f1cfe 100644 --- a/modules/multi/donor_match.nf +++ b/modules/multi/donor_match.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 process matchDonor{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode", mode: 'copy' + publishDir "$params.outdir/$sampleId/$params.mode", mode: 'copy' label 'big_mem' tag "${sampleId}" conda "$projectDir/conda/donor_match.yml" @@ -24,12 +24,12 @@ process matchDonor{ def cell_genotype_path = "" if (findVariants == "True" | findVariants == "default"){ cell_genotype_path = cell_genotype != "None" ? "--cell_genotype $cell_genotype" : \ - "--cell_genotype $projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/cellSNP/cellsnp_1/*/cellSNP.cells.vcf.gz" + "--cell_genotype $params.outdir/$sampleId/$params.mode/gene_demulti/cellSNP/cellsnp_1/*/cellSNP.cells.vcf.gz" } def vireo_parent_path = "" if ( findVariants == 'vireo' | findVariants == 'True' ){ - vireo_parent_path = (params.mode == "donor_match" & vireo_parent_dir != "None") ? "--vireo_parent_dir $vireo_parent_dir" : "--vireo_parent_dir $projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/vireo/" + vireo_parent_path = (params.mode == "donor_match" & vireo_parent_dir != "None") ? 
"--vireo_parent_dir $vireo_parent_dir" : "--vireo_parent_dir $params.outdir/$sampleId/$params.mode/gene_demulti/vireo/" } def barcode_whitelist_path = "--barcode $barcode_whitelist" """ @@ -43,7 +43,7 @@ process matchDonor{ if ([ "$findVariants" != "False" ]); then best_method_vireo="\$(head -n 1 \$outputdir/best_method_vireo.txt)" if ([ "$params.mode" != "donor_match" ]); then - donor_genotype="\$(find $projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/vireo/\$best_method_vireo -name GT_donors.vireo.vcf.gz | head -n 1)" + donor_genotype="\$(find $params.outdir/$sampleId/$params.mode/gene_demulti/vireo/\$best_method_vireo -name GT_donors.vireo.vcf.gz | head -n 1)" else donor_genotype="\$(find $vireo_parent_dir/\$best_method_vireo -name GT_donors.vireo.vcf.gz | head -n 1)" fi diff --git a/modules/multi/gene_demulti/bcftools.nf b/modules/multi/gene_demulti/bcftools.nf index 1a774e6..a1b2663 100644 --- a/modules/multi/gene_demulti/bcftools.nf +++ b/modules/multi/gene_demulti/bcftools.nf @@ -1,7 +1,7 @@ #!/usr/bin/env nextflow nextflow.enable.dsl=2 process bcftools{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/bcftools", mode: 'copy' + publishDir "$params.outdir/$sampleId/$params.mode/gene_demulti/bcftools", mode: 'copy' label 'big_mem' tag "${sampleId}" conda "bioconda::bcftools=1.9" diff --git a/modules/multi/gene_demulti/cellsnp.nf b/modules/multi/gene_demulti/cellsnp.nf index 2fb276c..973da31 100644 --- a/modules/multi/gene_demulti/cellsnp.nf +++ b/modules/multi/gene_demulti/cellsnp.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 process cellSNP{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/cellSNP", mode: 'copy' + publishDir "$params.outdir/$sampleId/$params.mode/gene_demulti/cellSNP", mode: 'copy' label 'big_mem' tag "${sampleId}" conda "bioconda::cellsnp-lite" diff --git a/modules/multi/gene_demulti/demuxlet.nf b/modules/multi/gene_demulti/demuxlet.nf index 44ae4b9..0d33fcf 100755 --- a/modules/multi/gene_demulti/demuxlet.nf +++ b/modules/multi/gene_demulti/demuxlet.nf @@ -22,7 +22,7 @@ process subset_bam_and_sort_vcf_based_on_reference{ } process demuxlet { - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/demuxlet", mode: 'copy' + publishDir "$params.outdir/$sampleId/$params.mode/gene_demulti/demuxlet", mode: 'copy' label 'small_mem' tag "${sampleId}" conda "bioconda::popscle" diff --git a/modules/multi/gene_demulti/freebayes.nf b/modules/multi/gene_demulti/freebayes.nf index 1117406..ccfb5ed 100644 --- a/modules/multi/gene_demulti/freebayes.nf +++ b/modules/multi/gene_demulti/freebayes.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 process freebayes{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/freebayes", mode: 'copy' + publishDir "$params.outdir/$sampleId/$params.mode/gene_demulti/freebayes", mode: 'copy' label 'big_mem' tag "${sampleId}" conda "bioconda::freebayes=1.2" diff --git a/modules/multi/gene_demulti/freemuxlet.nf b/modules/multi/gene_demulti/freemuxlet.nf index 58927cc..6d7f12e 100755 --- a/modules/multi/gene_demulti/freemuxlet.nf +++ b/modules/multi/gene_demulti/freemuxlet.nf @@ -21,7 +21,7 @@ process subset_bam_and_sort_vcf_based_on_reference{ } process freemuxlet { - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/freemuxlet", mode: 'copy' + publishDir "$params.outdir/$sampleId/$params.mode/gene_demulti/freemuxlet", mode: 'copy' label 'small_mem' tag "${sampleId}" conda "bioconda::popscle" diff --git 
a/modules/multi/gene_demulti/samtools.nf b/modules/multi/gene_demulti/samtools.nf index 10d3558..af8aea3 100644 --- a/modules/multi/gene_demulti/samtools.nf +++ b/modules/multi/gene_demulti/samtools.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 process samtools{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/samtools", mode: 'copy' + publishDir "$params.outdir/$sampleId/$params.mode/gene_demulti/samtools", mode: 'copy' label 'big_mem' tag "${sampleId}" conda "bioconda::samtools bioconda::umi_tools" diff --git a/modules/multi/gene_demulti/scsplit.nf b/modules/multi/gene_demulti/scsplit.nf index 7aceeb7..de7dab9 100644 --- a/modules/multi/gene_demulti/scsplit.nf +++ b/modules/multi/gene_demulti/scsplit.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 process scSplit{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/scSplit", mode: 'copy' + publishDir "$params.outdir/$sampleId/$params.mode/gene_demulti/scSplit", mode: 'copy' label 'big_mem' tag "${sampleId}" conda "$projectDir/conda/scsplit.yml" diff --git a/modules/multi/gene_demulti/souporcell.nf b/modules/multi/gene_demulti/souporcell.nf index 102487f..19af807 100755 --- a/modules/multi/gene_demulti/souporcell.nf +++ b/modules/multi/gene_demulti/souporcell.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 process souporcell{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/souporcell", mode: 'copy' + publishDir "$params.outdir/$sampleId/$params.mode/gene_demulti/souporcell", mode: 'copy' label 'big_mem' tag "${sampleId}" container "shub://wheaton5/souporcell" diff --git a/modules/multi/gene_demulti/vireo.nf b/modules/multi/gene_demulti/vireo.nf index 3cb43c1..d840ff1 100755 --- a/modules/multi/gene_demulti/vireo.nf +++ b/modules/multi/gene_demulti/vireo.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 process vireo{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti/vireo", mode: 'copy' + publishDir "$params.outdir/$sampleId/$params.mode/gene_demulti/vireo", mode: 'copy' label 'big_mem' tag "${sampleId}" conda "aksarkar::vireosnp" diff --git a/modules/multi/gene_demultiplexing.nf b/modules/multi/gene_demultiplexing.nf index dad68fa..ab71add 100644 --- a/modules/multi/gene_demultiplexing.nf +++ b/modules/multi/gene_demultiplexing.nf @@ -43,21 +43,17 @@ process subset_bam_to_comon_variants{ } process summary{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/gene_demulti", mode: 'copy' + publishDir "$params.outdir/$sampleId/$params.mode/gene_demulti", mode: 'copy' label 'small_mem' tag "${sampleId}" conda "pandas scanpy mudata" input: - tuple val(sampleId), path(hto_matrix, stageAs: 'hto_data'), path(rna_matrix, stageAs: 'rna_data') - val demuxlet_result - val freemuxlet_result - val vireo_result - val souporcell_result - val scsplit_result + tuple(val(sampleId), path(hto_matrix, stageAs: 'hto_data'), path(rna_matrix, stageAs: 'rna_data'), val(souporcell_result), val(scsplit_result), val(vireo_result),val(freemuxlet_result),val(demuxlet_result)) val generate_anndata val generate_mudata - + + output: tuple val(sampleId), path("genetic_summary") @@ -70,25 +66,20 @@ process summary{ def generate_adata = "" def generate_mdata = "" - if (demuxlet_result != "no_result"){ - demuxlet_res = demuxlet_result.find{it.name.contains(sampleId)} - demuxlet_files = "--demuxlet ${demuxlet_res}" + if (demuxlet_result){ + demuxlet_files = "--demuxlet ${demuxlet_result}" } - if (freemuxlet_result != "no_result"){ - freemuxlet_res = 
freemuxlet_result.find{it.name.contains(sampleId)} - freemuxlet_files = "--freemuxlet ${freemuxlet_res}" + if (freemuxlet_result){ + freemuxlet_files = "--freemuxlet ${freemuxlet_result}" } - if (vireo_result != "no_result"){ - vireo_res = vireo_result.find{it.name.contains(sampleId)} - vireo_files = "--vireo ${vireo_res}" + if (vireo_result){ + vireo_files = "--vireo ${vireo_result}" } - if (souporcell_result != "no_result"){ - souporcell_res = souporcell_result.find{it.name.contains(sampleId)} - souporcell_files = "--souporcell ${souporcell_res}" + if (souporcell_result){ + souporcell_files = "--souporcell ${souporcell_result}" } - if (scsplit_result != "no_result"){ - scsplit_res = scsplit_result.find{it.name.contains(sampleId)} - scsplit_files = "--scsplit ${scsplit_res}" + if (scsplit_result){ + scsplit_files = "--scsplit ${scsplit_result}" } if (generate_anndata == "True"){ if(rna_matrix.name == "None"){ @@ -291,8 +282,18 @@ workflow gene_demultiplexing { | map { row-> tuple(row.sampleId, file(row.hto_matrix_filtered), file(row.rna_matrix_filtered))} | set {input_list_summary} - summary(input_list_summary, demuxlet_out, freemuxlet_out, vireo_out, souporcell_out, scSplit_out, + demuxlet_out_ch = demuxlet_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*demuxlet_",""), r1 )} + freemuxlet_out_ch = freemuxlet_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*freemuxlet_",""), r1 )} + vireo_out_ch = vireo_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*vireo_",""), r1 )} + scSplit_out_ch = scSplit_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*scsplit_",""), r1 )} + souporcell_out_ch = souporcell_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*souporcell_",""), r1 )} + + summary_input = input_list_summary.join(souporcell_out_ch,by:0,remainder: true).join(scSplit_out_ch,by:0,remainder: true).join(vireo_out_ch,by:0,remainder: true).join(freemuxlet_out_ch,by:0,remainder: true).join(demuxlet_out_ch,by:0,remainder: true) + summary_input = summary_input.filter{ it[0] != 'no_result' } + + summary(summary_input, params.generate_anndata, params.generate_mudata) + emit: diff --git a/modules/multi/hash_demulti/bff.nf b/modules/multi/hash_demulti/bff.nf index 3cc1e1b..4546258 100644 --- a/modules/multi/hash_demulti/bff.nf +++ b/modules/multi/hash_demulti/bff.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 process bff{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/hash_demulti/bff", mode:'copy' + publishDir "$params.outdir/$sampleId/$params.mode/hash_demulti/bff", mode:'copy' label 'small_mem' conda "$projectDir/conda/bff.yml" diff --git a/modules/multi/hash_demulti/demuxem.nf b/modules/multi/hash_demulti/demuxem.nf index 71493e7..1b542f1 100644 --- a/modules/multi/hash_demulti/demuxem.nf +++ b/modules/multi/hash_demulti/demuxem.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 process demuxem{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/hash_demulti/demuxem", mode:'copy' + publishDir "$params.outdir/$sampleId/$params.mode/hash_demulti/demuxem", mode:'copy' label 'small_mem' conda "bioconda::pegasuspy bioconda::scanpy bioconda::demuxEM" diff --git a/modules/multi/hash_demulti/gmm_demux.nf b/modules/multi/hash_demulti/gmm_demux.nf index bb98533..03d009c 100644 --- a/modules/multi/hash_demulti/gmm_demux.nf +++ b/modules/multi/hash_demulti/gmm_demux.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 process gmm_demux{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/hash_demulti/gmm_demux", mode:'copy' + publishDir 
"$params.outdir/$sampleId/$params.mode/hash_demulti/gmm_demux", mode:'copy' label 'small_mem' conda "$projectDir/conda/gmm_demux.yml" diff --git a/modules/multi/hash_demulti/hashedDrops.nf b/modules/multi/hash_demulti/hashedDrops.nf index 9c5b388..dd19f4c 100755 --- a/modules/multi/hash_demulti/hashedDrops.nf +++ b/modules/multi/hash_demulti/hashedDrops.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 process hashedDrops{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/hash_demulti/hashedDrops", mode:'copy' + publishDir "$params.outdir/$sampleId/$params.mode/hash_demulti/hashedDrops", mode:'copy' label 'small_mem' conda "conda-forge::r-seurat conda-forge::r-argparse bioconda::bioconductor-dropletutils" diff --git a/modules/multi/hash_demulti/hashsolo.nf b/modules/multi/hash_demulti/hashsolo.nf index 8b2328b..99e11a2 100755 --- a/modules/multi/hash_demulti/hashsolo.nf +++ b/modules/multi/hash_demulti/hashsolo.nf @@ -1,7 +1,7 @@ #!/usr/bin/env nextflow nextflow.enable.dsl=2 process hash_solo{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/hash_demulti/hashsolo", mode:'copy' + publishDir "$params.outdir/$sampleId/$params.mode/hash_demulti/hashsolo", mode:'copy' label 'small_mem' conda "$projectDir/conda/hashsolo_py.yml" diff --git a/modules/multi/hash_demulti/htodemux.nf b/modules/multi/hash_demulti/htodemux.nf index bf18a55..f2d8a55 100644 --- a/modules/multi/hash_demulti/htodemux.nf +++ b/modules/multi/hash_demulti/htodemux.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 process htodemux{ - publishDir "$projectDir/$params.outdir/${seurat_object.name.tokenize( '_' )[1]}/$params.mode/hash_demulti/htodemux", mode: 'copy' + publishDir "$params.outdir/${seurat_object.name.tokenize( '_' )[1]}/$params.mode/hash_demulti/htodemux", mode: 'copy' label 'small_mem' conda "conda-forge::r-seurat conda-forge::r-argparse" diff --git a/modules/multi/hash_demulti/multiseq.nf b/modules/multi/hash_demulti/multiseq.nf index db9c58b..cdc926c 100644 --- a/modules/multi/hash_demulti/multiseq.nf +++ b/modules/multi/hash_demulti/multiseq.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 process multi_seq{ - publishDir "$projectDir/$params.outdir/${seurat_object.name.tokenize( '_' )[1]}/$params.mode/hash_demulti/multiseq", mode: 'copy' + publishDir "$params.outdir/${seurat_object.name.tokenize( '_' )[1]}/$params.mode/hash_demulti/multiseq", mode: 'copy' label 'small_mem' conda "conda-forge::r-seurat conda-forge::r-argparse" diff --git a/modules/multi/hash_demulti/preprocess.nf b/modules/multi/hash_demulti/preprocess.nf index 752d327..4bf01ee 100644 --- a/modules/multi/hash_demulti/preprocess.nf +++ b/modules/multi/hash_demulti/preprocess.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 process preprocess{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/hash_demulti/preprocess", mode:'copy' + publishDir "$params.outdir/$sampleId/$params.mode/hash_demulti/preprocess", mode:'copy' label 'small_mem' conda "conda-forge::r-seurat conda-forge::r-argparse" diff --git a/modules/multi/hash_demulti/solo.nf b/modules/multi/hash_demulti/solo.nf index 9fba924..4c9061a 100755 --- a/modules/multi/hash_demulti/solo.nf +++ b/modules/multi/hash_demulti/solo.nf @@ -2,7 +2,7 @@ nextflow.enable.dsl=2 process solo{ - publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/solo", mode:'copy' + publishDir "$params.outdir/$params.mode/hash_demulti/solo", mode:'copy' label 'small_mem' input: diff --git a/modules/multi/hash_demultiplexing.nf b/modules/multi/hash_demultiplexing.nf index 82331fb..06e3e1b 100644 --- 
a/modules/multi/hash_demultiplexing.nf +++ b/modules/multi/hash_demultiplexing.nf @@ -11,7 +11,7 @@ include { gmm_demux_hashing } from './hash_demulti/gmm_demux' include { bff_hashing } from './hash_demulti/bff' process summary{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/hash_demulti", mode: 'copy' + publishDir "$params.outdir/$sampleId/$params.mode/hash_demulti", mode: 'copy' label 'small_mem' conda "pandas scanpy mudata" diff --git a/modules/multi_demultiplexing.nf b/modules/multi_demultiplexing.nf index 8f9f954..6dc6010 100644 --- a/modules/multi_demultiplexing.nf +++ b/modules/multi_demultiplexing.nf @@ -7,7 +7,7 @@ include { donor_match } from "$projectDir/modules/multi/donor_match" process generate_data{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/data_output", mode: 'copy' + publishDir "$params.outdir/$sampleId/$params.mode/data_output", mode: 'copy' label 'small_mem' conda "pandas scanpy mudata" @@ -47,7 +47,7 @@ process generate_data{ } process summary_all{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode", mode: 'copy' + publishDir "$params.outdir/$sampleId/$params.mode", mode: 'copy' label 'small_mem' conda "pandas scanpy mudata" diff --git a/modules/single_demultiplexing.nf b/modules/single_demultiplexing.nf index 16959e5..2f2a871 100644 --- a/modules/single_demultiplexing.nf +++ b/modules/single_demultiplexing.nf @@ -7,7 +7,7 @@ include { donor_match } from "$projectDir/modules/single/donor_match" process generate_data{ - publishDir "$projectDir/$params.outdir/$params.mode/data_output", mode: 'copy' + publishDir "$params.outdir/$params.mode/data_output", mode: 'copy' conda "pandas scanpy mudata" @@ -48,7 +48,7 @@ process generate_data{ } process summary_all{ - publishDir "$projectDir/$params.outdir/$params.mode", mode: 'copy' + publishDir "$params.outdir/$params.mode", mode: 'copy' label 'small_mem' conda "pandas scanpy mudata" diff --git a/nextflow.config b/nextflow.config index ad65771..78e3a0f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -406,8 +406,13 @@ profiles { } withName: summary{ - cpus = 6 - memory = 20.GB + cpus = 1 + memory = {10.GB * task.attempt} + } + + withName: matchDonor{ + cpus = {2 * task.attempt} + memory = {10.GB * task.attempt } } withName:souporcell{ diff --git a/subworkflows/HADGE.nf b/subworkflows/HADGE.nf index 1f32b94..5437a4b 100644 --- a/subworkflows/HADGE.nf +++ b/subworkflows/HADGE.nf @@ -27,18 +27,24 @@ workflow SUMMARY{ | set {input_list_summary} log.info('running summary only') - // demuxlet_out = Channel.fromPath('/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/hedge_multi/hadge_modified/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/full_run/gt_absent/result/*/genetic/gene_demulti/demuxlet/demuxlet*', type: 'dir').collect() - demuxlet_out = channel.value("no_result") - freemuxlet_out= Channel.fromPath('/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/hedge_multi/hadge_modified/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/full_run/gt_absent/result/*/genetic/gene_demulti/freemuxlet/freemuxlet_*', type: 'dir').collect() - vireo_out= Channel.fromPath('/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/hedge_multi/hadge_modified/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/full_run/gt_absent/result/*/genetic/gene_demulti/vireo/vireo_*', type: 'dir').collect() - scSplit_out= 
Channel.fromPath('/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/hedge_multi/hadge_modified/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/full_run/gt_absent/result/*/genetic/gene_demulti/scSplit/scsplit*', type: 'dir').collect() - souporcell_out= Channel.fromPath('/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/hedge_multi/hadge_modified/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/full_run/gt_absent/result/*/genetic/gene_demulti/souporcell/souporcell_*', type: 'dir').collect() - // /lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/hedge_multi/hadge_modified/lustre/scratch125/humgen/teams/hgi/ip13/1k1k/hadge/full_run/gt_absent/result/pool1/genetic/gene_demulti/souporcell/souporcell_pool1 - // souporcell_out = channel.value("no_result") - summary(input_list_summary, demuxlet_out, freemuxlet_out, vireo_out, souporcell_out, scSplit_out, + demuxlet_out = Channel.fromPath("${params.outdir}/*/genetic/gene_demulti/demuxlet/demuxlet_*", type: 'dir').collect().ifEmpty('no_result') + freemuxlet_out= Channel.fromPath("${params.outdir}/*/genetic/gene_demulti/freemuxlet/freemuxlet_*", type: 'dir').collect().ifEmpty('no_result') + vireo_out= Channel.fromPath("${params.outdir}/*/genetic/gene_demulti/vireo/vireo_*", type: 'dir').collect().ifEmpty('no_result') + scSplit_out= Channel.fromPath("${params.outdir}/*/genetic/gene_demulti/scSplit/scsplit*", type: 'dir').collect().ifEmpty('no_result') + souporcell_out= Channel.fromPath("${params.outdir}/*/genetic/gene_demulti/souporcell/souporcell_*", type: 'dir').collect().ifEmpty('no_result') + + demuxlet_out_ch = demuxlet_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*demuxlet_",""), r1 )} + freemuxlet_out_ch = freemuxlet_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*freemuxlet_",""), r1 )} + vireo_out_ch = vireo_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*vireo_",""), r1 )} + scSplit_out_ch = scSplit_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*scsplit_",""), r1 )} + souporcell_out_ch = souporcell_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*souporcell_",""), r1 )} + + summary_input = input_list_summary.join(souporcell_out_ch,by:0,remainder: true).join(scSplit_out_ch,by:0,remainder: true).join(vireo_out_ch,by:0,remainder: true).join(freemuxlet_out_ch,by:0,remainder: true).join(demuxlet_out_ch,by:0,remainder: true) + summary_input = summary_input.filter{ it[0] != 'no_result' } + + summary(summary_input, params.generate_anndata, params.generate_mudata) - Channel.fromPath(params.multi_input) \ | splitCsv(header:true) \ | map { row-> tuple(row.sampleId, row.nsample, row.barcodes, "None", "None")} From 8f82b36ca557109e2a0c35e9f5d77f7f9efca11d Mon Sep 17 00:00:00 2001 From: Matiss Ozols Date: Thu, 1 Feb 2024 17:55:34 +0000 Subject: [PATCH 05/16] removing redundant single process --- main.nf | 2 +- modules/multi/gene_demultiplexing.nf | 292 +++++++++---------- modules/multi/hash_demultiplexing.nf | 194 ++++++------ modules/multi/preprocessing/preprocessing.nf | 34 +++ modules/multi_demultiplexing.nf | 116 +++----- nextflow.config | 20 +- subworkflows/HADGE.nf | 45 ++- test_data/download_data.sh | 2 +- test_data/hto/barcodes.tsv | 0 test_data/hto/genes.tsv | 0 test_data/hto/matrix.mtx | 0 test_data/multi_sample_input.csv | 0 test_data/rna/barcodes.tsv | 0 test_data/rna/genes.tsv | 0 test_data/rna/matrix.mtx | 0 test_data/simulation.R | 0 16 files changed, 372 insertions(+), 333 deletions(-) create mode 100644 modules/multi/preprocessing/preprocessing.nf mode change 100644 => 100755 test_data/download_data.sh mode change 
100644 => 100755 test_data/hto/barcodes.tsv mode change 100644 => 100755 test_data/hto/genes.tsv mode change 100644 => 100755 test_data/hto/matrix.mtx mode change 100644 => 100755 test_data/multi_sample_input.csv mode change 100644 => 100755 test_data/rna/barcodes.tsv mode change 100644 => 100755 test_data/rna/genes.tsv mode change 100644 => 100755 test_data/rna/matrix.mtx mode change 100644 => 100755 test_data/simulation.R diff --git a/main.nf b/main.nf index 712750b..77aab97 100644 --- a/main.nf +++ b/main.nf @@ -10,6 +10,6 @@ workflow { } // Entry point to generate only the summary files -workflow SUMMARY_ONLY{ +workflow HADGE_SUMMARY_ONLY{ SUMMARY() } diff --git a/modules/multi/gene_demultiplexing.nf b/modules/multi/gene_demultiplexing.nf index ab71add..2567e7f 100644 --- a/modules/multi/gene_demultiplexing.nf +++ b/modules/multi/gene_demultiplexing.nf @@ -97,205 +97,205 @@ process summary{ generate_mdata = "--generate_mudata --read_rna_mtx rna_data --read_hto_mtx hto_data" } """ - summary_gene.py $demuxlet_files $vireo_files $souporcell_files $scsplit_files $freemuxlet_files $generate_adata $generate_mdata + summary_gene.py $demuxlet_files $vireo_files $souporcell_files $scsplit_files $freemuxlet_files $generate_adata $generate_mdata """ } workflow gene_demultiplexing { + take: + input_channel + main: + + if ((params.demuxlet == "True" & params.demuxlet_preprocess == "True") | \ + (params.freemuxlet == "True" & params.freemuxlet_preprocess == "True") | \ + (params.scSplit == "True" & params.scSplit_preprocess == "True") | \ + (params.vireo == "True" & params.vireo_preprocess == "True") | \ + (params.souporcell == "True" & params.souporcell_preprocess == "True")) { - if ((params.demuxlet == "True" & params.demuxlet_preprocess == "True") | \ - (params.freemuxlet == "True" & params.freemuxlet_preprocess == "True") | \ - (params.scSplit == "True" & params.scSplit_preprocess == "True") | \ - (params.vireo == "True" & params.vireo_preprocess == "True") | \ - (params.souporcell == "True" & params.souporcell_preprocess == "True")) { + input_channel \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, row.bam)} + | data_preprocess + qc_bam = data_preprocess.out.map{ it -> tuple( it.name.tokenize( '_' ).last(), it + "/sorted.bam", it + "/sorted.bam.bai") } + }else{ + qc_bam = input_channel \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, row.bam, row.bam_index)} - Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.bam)} - | data_preprocess - qc_bam = data_preprocess.out.map{ it -> tuple( it.name.tokenize( '_' ).last(), it + "/sorted.bam", it + "/sorted.bam.bai") } - }else{ - qc_bam = Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.bam, row.bam_index)} + } - } + input_param_cellsnp = input_channel \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, row.barcodes) } + qc_bam_new = qc_bam.join(input_param_cellsnp) - input_param_cellsnp = Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.barcodes) } - qc_bam_new = qc_bam.join(input_param_cellsnp) - qc_bam = subset_bam_to_comon_variants(qc_bam_new,params.common_variants_freemuxlet) + if (params.subset_bam_to_comon_variants){ + qc_bam = subset_bam_to_comon_variants(qc_bam_new,params.common_variants_freemuxlet) + } + ////////// + //FreeBayes/ scSplit + ////////// + if (params.scSplit == "True" & params.scSplit_variant == 'True'){ - //////////
//FreeBayes/ scSplit - ////////// - if (params.scSplit == "True" & params.scSplit_variant == 'True'){ + freebayes_region = Channel.from(1..22, "X","Y").flatten() + if (params.region != "None"){ + freebayes_region = split_input(params.region) + } - freebayes_region = Channel.from(1..22, "X","Y").flatten() - if (params.region != "None"){ - freebayes_region = split_input(params.region) + variant_freebayes(qc_bam, freebayes_region) + filter_variant(variant_freebayes.out) + freebayes_vcf = filter_variant.out.map{ it -> tuple(it[0], it[1] + "/filtered_sorted_total_chroms.vcf")} } - variant_freebayes(qc_bam, freebayes_region) - filter_variant(variant_freebayes.out) - freebayes_vcf = filter_variant.out.map{ it -> tuple(it[0], it[1] + "/filtered_sorted_total_chroms.vcf")} - } + if (params.scSplit == "True"){ - if (params.scSplit == "True"){ + input_bam_scsplit = qc_bam - input_bam_scsplit = qc_bam + if (params.scSplit_variant == 'True'){ + input_vcf_scsplit = freebayes_vcf + } + else{ + + input_vcf_scsplit = input_channel \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, row.vcf_mixed)} + } - if (params.scSplit_variant == 'True'){ - input_vcf_scsplit = freebayes_vcf + input_param_scsplit = input_channel \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, row.barcodes, row.nsample, row.vcf_donor)} + + input_list_scsplit = input_bam_scsplit.join(input_vcf_scsplit) + input_list_scsplit = input_list_scsplit.join(input_param_scsplit) + demultiplex_scSplit(input_list_scsplit) + scSplit_out = demultiplex_scSplit.out } else{ - - input_vcf_scsplit = Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.vcf_mixed)} + scSplit_out = channel.value("no_result") } - input_param_scsplit = Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.barcodes, row.nsample, row.vcf_donor)} - - input_list_scsplit = input_bam_scsplit.join(input_vcf_scsplit) - input_list_scsplit = input_list_scsplit.join(input_param_scsplit) - demultiplex_scSplit(input_list_scsplit) - scSplit_out = demultiplex_scSplit.out - } - else{ - scSplit_out = channel.value("no_result") - } - - ////////// - //CellSNP/Vireo - ////////// - if (params.vireo == "True" & params.vireo_variant == 'True'){ + ////////// + //CellSNP/Vireo + ////////// + if (params.vireo == "True" & params.vireo_variant == 'True'){ - variant_cellSNP(qc_bam_new) - cellsnp_vcf = variant_cellSNP.out.out1.map{ it -> tuple( it.name.tokenize( '_' ).last(), it + "/*/cellSNP.cells.vcf") } + variant_cellSNP(qc_bam_new) + cellsnp_vcf = variant_cellSNP.out.out1.map{ it -> tuple( it.name.tokenize( '_' ).last(), it + "/*/cellSNP.cells.vcf") } - } + } - if (params.vireo == "True"){ + if (params.vireo == "True"){ - if (params.vireo_variant == 'True'){ - input_vcf_vireo = variant_cellSNP.out.cellsnp_input + if (params.vireo_variant == 'True'){ + input_vcf_vireo = variant_cellSNP.out.cellsnp_input + } + else{ + input_vcf_vireo = input_channel \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, row.celldata)} + } + input_param_vireo = input_channel \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, row.nsample, row.vcf_donor)} + + input_list_vireo = input_vcf_vireo.join(input_param_vireo) + demultiplex_vireo(input_list_vireo) + vireo_out = demultiplex_vireo.out } else{ - input_vcf_vireo = Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.celldata)} + vireo_out = 
channel.value("no_result") } - input_param_vireo = Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.nsample, row.vcf_donor)} - - input_list_vireo = input_vcf_vireo.join(input_param_vireo) - demultiplex_vireo(input_list_vireo) - vireo_out = demultiplex_vireo.out - } - else{ - vireo_out = channel.value("no_result") - } - ////////// - // Demuxlet/Freemuxlet - // demuxlet (with genotypes) or freemuxlet (without genotypes) - ////////// + ////////// + // Demuxlet/Freemuxlet + // demuxlet (with genotypes) or freemuxlet (without genotypes) + ////////// - if (params.demuxlet == "True"){ + if (params.demuxlet == "True"){ - input_bam_demuxlet = qc_bam + input_bam_demuxlet = qc_bam - input_param_demuxlet = Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.barcodes, row.vcf_donor)} - input_list_demuxlet = input_bam_demuxlet.join(input_param_demuxlet) - demultiplex_demuxlet(input_list_demuxlet) - demuxlet_out = demultiplex_demuxlet.out - } - else{ - demuxlet_out = channel.value("no_result") - } + input_param_demuxlet = input_channel \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, row.barcodes, row.vcf_donor)} + input_list_demuxlet = input_bam_demuxlet.join(input_param_demuxlet) + demultiplex_demuxlet(input_list_demuxlet) + demuxlet_out = demultiplex_demuxlet.out + } + else{ + demuxlet_out = channel.value("no_result") + } - ////////// - //Freemuxlet - ////////// + ////////// + //Freemuxlet + ////////// - if (params.freemuxlet == "True"){ + if (params.freemuxlet == "True"){ - input_bam_freemuxlet = qc_bam - - input_param_freemuxlet = Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.barcodes, row.nsample)} + input_bam_freemuxlet = qc_bam + + input_param_freemuxlet = input_channel \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, row.barcodes, row.nsample)} - input_list_freemuxlet = input_bam_freemuxlet.join(input_param_freemuxlet) + input_list_freemuxlet = input_bam_freemuxlet.join(input_param_freemuxlet) - demultiplex_freemuxlet(input_list_freemuxlet) - freemuxlet_out = demultiplex_freemuxlet.out - } - else{ - freemuxlet_out = channel.value("no_result") - } + demultiplex_freemuxlet(input_list_freemuxlet) + freemuxlet_out = demultiplex_freemuxlet.out + } + else{ + freemuxlet_out = channel.value("no_result") + } - ////////// - //Souporcell - ////////// + ////////// + //Souporcell + ////////// - if (params.souporcell == "True"){ - - input_bam_souporcell = qc_bam - - input_param_souporcell = Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, row.barcodes, row.nsample, row.vcf_donor)} - - input_list_souporcell = input_bam_souporcell.join(input_param_souporcell) - demultiplex_souporcell(input_list_souporcell) - souporcell_out = demultiplex_souporcell.out - } - else{ - souporcell_out = channel.value("no_result") - } + if (params.souporcell == "True"){ + + input_bam_souporcell = qc_bam - ////////// - //Summary - ////////// - - Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, file(row.hto_matrix_filtered), file(row.rna_matrix_filtered))} - | set {input_list_summary} + input_param_souporcell = input_channel \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, row.barcodes, row.nsample, row.vcf_donor)} + + input_list_souporcell = input_bam_souporcell.join(input_param_souporcell) + 
demultiplex_souporcell(input_list_souporcell) + souporcell_out = demultiplex_souporcell.out + } + else{ + souporcell_out = channel.value("no_result") + } + + ////////// + //Summary + ////////// + + input_list_summary = input_channel.splitCsv(header:true).map { row-> tuple(row.sampleId, file(row.hto_matrix_filtered), file(row.rna_matrix_filtered))} - demuxlet_out_ch = demuxlet_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*demuxlet_",""), r1 )} - freemuxlet_out_ch = freemuxlet_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*freemuxlet_",""), r1 )} - vireo_out_ch = vireo_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*vireo_",""), r1 )} - scSplit_out_ch = scSplit_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*scsplit_",""), r1 )} - souporcell_out_ch = souporcell_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*souporcell_",""), r1 )} + demuxlet_out_ch = demuxlet_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*demuxlet_",""), r1 )} + freemuxlet_out_ch = freemuxlet_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*freemuxlet_",""), r1 )} + vireo_out_ch = vireo_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*vireo_",""), r1 )} + scSplit_out_ch = scSplit_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*scsplit_",""), r1 )} + souporcell_out_ch = souporcell_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*souporcell_",""), r1 )} - summary_input = input_list_summary.join(souporcell_out_ch,by:0,remainder: true).join(scSplit_out_ch,by:0,remainder: true).join(vireo_out_ch,by:0,remainder: true).join(freemuxlet_out_ch,by:0,remainder: true).join(demuxlet_out_ch,by:0,remainder: true) - summary_input = summary_input.filter{ it[0] != 'no_result' } + summary_input = input_list_summary.join(souporcell_out_ch,by:0,remainder: true).join(scSplit_out_ch,by:0,remainder: true).join(vireo_out_ch,by:0,remainder: true).join(freemuxlet_out_ch,by:0,remainder: true).join(demuxlet_out_ch,by:0,remainder: true) + summary_input = summary_input.filter{ it[0] != 'no_result' } - summary(summary_input, - params.generate_anndata, params.generate_mudata) + summary(summary_input, + params.generate_anndata, params.generate_mudata) - - emit: summary.out } diff --git a/modules/multi/hash_demultiplexing.nf b/modules/multi/hash_demultiplexing.nf index 06e3e1b..4f204e0 100644 --- a/modules/multi/hash_demultiplexing.nf +++ b/modules/multi/hash_demultiplexing.nf @@ -87,114 +87,112 @@ process summary{ } """ - summary_hash.py $demuxem_files $htodemux_files $multiseq_files $hashedDrops_files $hashsolo_files $gmmDemux_files $bff_files $generate_adata $generate_mdata --sampleId $sampleId + summary_hash.py $demuxem_files $htodemux_files $multiseq_files $hashedDrops_files $hashsolo_files $gmmDemux_files $bff_files $generate_adata $generate_mdata --sampleId $sampleId """ } workflow hash_demultiplexing{ - if (params.htodemux == "True"){ - Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, params.hto_matrix_htodemux == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered, - params.rna_matrix_htodemux == "raw" ? 
row.rna_matrix_raw : row.rna_matrix_filtered)} - | set {input_list_preprocess_htodemux} - preprocessing_hashing_htodemux(input_list_preprocess_htodemux, params.hto_matrix_htodemux, params.rna_matrix_htodemux) - htodemux_preprocess_out = preprocessing_hashing_htodemux.out - htodemux_hashing(htodemux_preprocess_out) - htodemux_out = htodemux_hashing.out - } + take: + input_channel + main: + + if (params.htodemux == "True"){ + input_channel.splitCsv(header:true).map { row-> tuple(row.sampleId, params.hto_matrix_htodemux == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered, + params.rna_matrix_htodemux == "raw" ? row.rna_matrix_raw : row.rna_matrix_filtered)}.set{input_list_preprocess_htodemux} + + preprocessing_hashing_htodemux(input_list_preprocess_htodemux, params.hto_matrix_htodemux, params.rna_matrix_htodemux) + htodemux_preprocess_out = preprocessing_hashing_htodemux.out + htodemux_hashing(htodemux_preprocess_out) + htodemux_out = htodemux_hashing.out + } + else{ + htodemux_out = channel.value("no_result") + } + + if (params.multiseq == "True"){ + if (params.htodemux == "True" & params.hto_matrix_htodemux == params.hto_matrix_multiseq & + params.rna_matrix_htodemux == params.rna_matrix_multiseq){ + multiseq_preprocess_out = htodemux_preprocess_out + } + else{ + input_channel.splitCsv(header:true).map { row-> tuple(row.sampleId, params.hto_matrix_multiseq == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered, + params.rna_matrix_multiseq == "raw" ? row.rna_matrix_raw : row.rna_matrix_filtered)}.set {input_list_preprocess_multiseq} + + preprocessing_hashing_multiseq(input_list_preprocess_multiseq, params.hto_matrix_multiseq, params.rna_matrix_multiseq) + multiseq_preprocess_out = preprocessing_hashing_multiseq.out + } + multiseq_hashing(multiseq_preprocess_out) + multiseq_out = multiseq_hashing.out + } + else{ + multiseq_out = channel.value("no_result") + } + + if (params.hashsolo == "True"){ + input_channel \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, params.hto_matrix_hashsolo == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered, + params.rna_matrix_hashsolo == "False" ? channel.value("None") : + (params.rna_matrix_hashsolo == "raw" ? row.rna_matrix_raw : row.rna_matrix_filtered) + )} + | hash_solo_hashing + hashsolo_out = hash_solo_hashing.out + } + else{ + hashsolo_out = channel.value("no_result") + } + + if (params.demuxem == "True"){ + input_channel \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, params.hto_matrix_demuxem == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered, + params.rna_matrix_demuxem == "raw" ? row.rna_matrix_raw : row.rna_matrix_filtered)} + | demuxem_hashing + demuxem_out = demuxem_hashing.out + } + else{ + demuxem_out = channel.value("no_result") + } + + if (params.hashedDrops == "True"){ + input_channel \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, params.hto_matrix_hashedDrops == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered )} + | hashedDrops_hashing + hashedDrops_out = hashedDrops_hashing.out + } + else{ + hashedDrops_out = channel.value("no_result") + } + + + if (params.bff == "True"){ + input_channel \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, params.hto_matrix_bff == "raw" ? 
row.hto_matrix_raw : row.hto_matrix_filtered )} + | bff_hashing + bff_out = bff_hashing.out + print("BFF path to output") + } else{ - htodemux_out = channel.value("no_result") - } - - if (params.multiseq == "True"){ - if (params.htodemux == "True" & params.hto_matrix_htodemux == params.hto_matrix_multiseq & - params.rna_matrix_htodemux == params.rna_matrix_multiseq){ - multiseq_preprocess_out = htodemux_preprocess_out + bff_out = channel.value("no_result") + } + if (params.gmmDemux == "True"){ + input_channel \ + | splitCsv(header:true) \ + | map { row-> tuple(row.sampleId, params.hto_matrix_gmm_demux == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered, row.hto_name_gmm )} + | gmm_demux_hashing + gmmDemux_out = gmm_demux_hashing.out } else{ - Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, params.hto_matrix_multiseq == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered, - params.rna_matrix_multiseq == "raw" ? row.rna_matrix_raw : row.rna_matrix_filtered)} - | set {input_list_preprocess_multiseq} - preprocessing_hashing_multiseq(input_list_preprocess_multiseq, params.hto_matrix_multiseq, params.rna_matrix_multiseq) - multiseq_preprocess_out = preprocessing_hashing_multiseq.out - } - multiseq_hashing(multiseq_preprocess_out) - multiseq_out = multiseq_hashing.out - } - else{ - multiseq_out = channel.value("no_result") - } - - if (params.hashsolo == "True"){ - Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, params.hto_matrix_hashsolo == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered, - params.rna_matrix_hashsolo == "False" ? channel.value("None") : - (params.rna_matrix_hashsolo == "raw" ? row.rna_matrix_raw : row.rna_matrix_filtered) - )} - | hash_solo_hashing - hashsolo_out = hash_solo_hashing.out - } - else{ - hashsolo_out = channel.value("no_result") - } - - if (params.demuxem == "True"){ - Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, params.hto_matrix_demuxem == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered, - params.rna_matrix_demuxem == "raw" ? row.rna_matrix_raw : row.rna_matrix_filtered)} - | demuxem_hashing - demuxem_out = demuxem_hashing.out - } - else{ - demuxem_out = channel.value("no_result") - } - - if (params.hashedDrops == "True"){ - Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, params.hto_matrix_hashedDrops == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered )} - | hashedDrops_hashing - hashedDrops_out = hashedDrops_hashing.out - } - else{ - hashedDrops_out = channel.value("no_result") - } - + gmmDemux_out = channel.value("no_result") + } - if (params.bff == "True"){ - Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, params.hto_matrix_bff == "raw" ? row.hto_matrix_raw : row.hto_matrix_filtered )} - | bff_hashing - bff_out = bff_hashing.out - print("BFF path to output") - } - else{ - bff_out = channel.value("no_result") - } - if (params.gmmDemux == "True"){ - Channel.fromPath(params.multi_input) \ - | splitCsv(header:true) \ - | map { row-> tuple(row.sampleId, params.hto_matrix_gmm_demux == "raw" ? 
row.hto_matrix_raw : row.hto_matrix_filtered, row.hto_name_gmm )}
-            | gmm_demux_hashing
-        gmmDemux_out = gmm_demux_hashing.out
-    }
-    else{
-        gmmDemux_out = channel.value("no_result")
-    }
+        input_channel.splitCsv(header:true).map { row-> tuple(row.sampleId, file(row.hto_matrix_filtered), file(row.rna_matrix_filtered))}.set {input_list_summary}
 
-    Channel.fromPath(params.multi_input) \
-        | splitCsv(header:true) \
-        | map { row-> tuple(row.sampleId, file(row.hto_matrix_filtered), file(row.rna_matrix_filtered))}
-        | set {input_list_summary}
-    summary(input_list_summary, demuxem_out, hashsolo_out, htodemux_out, multiseq_out, hashedDrops_out,bff_out,gmmDemux_out, params.generate_anndata, params.generate_mudata)
-
+        summary(input_list_summary, demuxem_out, hashsolo_out, htodemux_out, multiseq_out, hashedDrops_out, bff_out, gmmDemux_out, params.generate_anndata, params.generate_mudata)
+
     emit:
         summary.out
 }
diff --git a/modules/multi/preprocessing/preprocessing.nf b/modules/multi/preprocessing/preprocessing.nf
new file mode 100644
index 0000000..fd1ddae
--- /dev/null
+++ b/modules/multi/preprocessing/preprocessing.nf
@@ -0,0 +1,34 @@
+process create_single_chanel_input{
+
+    tag "${sampleId}"
+    conda "$projectDir/conda/scsplit.yml"
+
+    input:
+        val sample_name
+        val hto_matrix_raw
+        val hto_matrix_filtered
+        val rna_matrix_raw
+        val rna_matrix_filtered
+        val bam
+        val bai
+        val barcodes
+        val fasta
+        val fasta_index
+        val nsample
+        val cell_data
+        val vcf_mixed
+        val vcf_donor
+        val vireo_parent_dir
+        val demultiplexing_result
+    output:
+        path('hadge_single_input.csv'), emit: input_channel
+
+    script:
+    """
+        echo "sampleId,rna_matrix_raw,rna_matrix_filtered,hto_matrix_raw,hto_matrix_filtered,bam,bam_index,barcodes,nsample,cell_data,vcf_mixed,vcf_donor,vireo_parent_dir,demultiplexing_result" > hadge_single_input.csv
+        echo "${sample_name},${rna_matrix_raw},${rna_matrix_filtered},${hto_matrix_raw},${hto_matrix_filtered},${bam},${bai},${barcodes},${nsample},${cell_data},${vcf_mixed},${vcf_donor},${vireo_parent_dir},${demultiplexing_result}" >> hadge_single_input.csv
+    """
+
+}
+
+
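The new `create_single_chanel_input` process above bridges single-sample runs into the multi-sample code path by writing a one-row CSV with the same columns as the multi-input sample sheet. As a minimal sketch of how such a sheet round-trips (the consuming workflow name and the `view()` call are illustrative, not part of the pipeline):

// Hypothetical consumer: read the one-row sheet back into a channel,
// exactly as the multi-sample path does with a user-provided CSV.
workflow check_single_input {
    main:
    Channel.fromPath('hadge_single_input.csv')
        .splitCsv(header: true)            // one map per CSV row, keyed by the header
        .map { row -> tuple(row.sampleId, row.bam, row.barcodes, row.nsample) }
        .view()                            // e.g. [hadge_process, sample.bam, barcodes.tsv, 2]
}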
diff --git a/modules/multi_demultiplexing.nf b/modules/multi_demultiplexing.nf
index 6dc6010..1e99bbb 100644
--- a/modules/multi_demultiplexing.nf
+++ b/modules/multi_demultiplexing.nf
@@ -59,88 +59,66 @@ process summary_all{
     script:
     """
-    summary.py --gene_demulti $gene_demulti_result --hash_demulti $hash_demulti_result
+        summary.py --gene_demulti $gene_demulti_result --hash_demulti $hash_demulti_result
     """
 }
 
 workflow run_multi{
+    take:
+        input_channel
+    main:
-    print("-----Running multiple samples-----")
+    if (params.mode == "genetic"){
+        // Performing genetic demultiplexing methodologies
+        gene_demultiplexing(input_channel)
+        ////////////
-    if (params.mode == "genetic"){
+        if (params.match_donor == "True"){
-        // Performing genetic demultiplexing methodologies
-        gene_demultiplexing()
-        ////////////
+            input_channel.splitCsv(header:true).map { row-> tuple(row.sampleId, row.nsample, row.barcodes, "None", "None")}.join(gene_demultiplexing.out).set{dm_input}
+        }
+    }
+    else if (params.mode == "hashing"){
+
+        // Performing hashing demultiplexing
+        hash_demultiplexing(input_channel)
+        ////////////
+
+        if (params.match_donor == "True"){
+            input_channel.splitCsv(header:true).map { row -> tuple(row.sampleId, row.nsample, row.barcodes, "None", "None")}.join(hash_demultiplexing.out).set{dm_input}
+        }
+    }
+    else if (params.mode == "rescue"){
-        if (params.match_donor == "True"){
+        // Performing both hashing and genetic demultiplexing methods
+        hash_demultiplexing(input_channel)
+        gene_demultiplexing(input_channel)
+        ////////////
+        gene_summary = gene_demultiplexing.out
+        hash_summary = hash_demultiplexing.out
+        input_summary_all = gene_summary.join(hash_summary)
+        summary_all(input_summary_all)
-            gene_demultiplexing.out.view()
-            Channel.fromPath(params.multi_input) \
-            | splitCsv(header:true) \
-            | map { row-> tuple(row.sampleId, row.nsample, row.barcodes, "None", "None")}
-            | join(gene_demultiplexing.out)
-            | donor_match
-        }
-    }
-    else if (params.mode == "hashing"){
+        if (params.match_donor == "True"){
+            input_channel.splitCsv(header:true).map { row -> tuple(row.sampleId, row.nsample, row.barcodes, "None", "None")}.join(summary_all.out).set{dm_input}
+        }
-        // Performing hashing demultplexing
-        hash_demultiplexing()
-        ////////////
-
-        if (params.match_donor == "True"){
-            Channel.fromPath(params.multi_input) \
-            | splitCsv(header:true) \
-            | map { row -> tuple(row.sampleId, row.nsample, row.barcodes, "None", "None")}
-            | join(hash_demultiplexing.out)
-            | donor_match
         }
-    }
-    else if (params.mode == "rescue"){
-
-        // Performing both hashing and genetic demultiplexing methods
-        hash_demultiplexing()
-        gene_demultiplexing()
-        ////////////
-
-        gene_summary = gene_demultiplexing.out
-        hash_summary = hash_demultiplexing.out
-        input_summary_all = gene_summary.join(hash_summary)
-        summary_all(input_summary_all)
-
-        if (params.match_donor == "True"){
-            Channel.fromPath(params.multi_input) \
-            | splitCsv(header:true) \
-            | map { row -> tuple(row.sampleId, row.nsample, row.barcodes, "None", "None")}
-            | join(summary_all.out)
-            | donor_match
+    else if (params.mode == "donor_match"){
+        // Performing just donor matching
+        input_channel.splitCsv(header:true).map { row -> tuple(row.sampleId, row.nsample, row.barcodes, row.celldata, row.vireo_parent_dir, row.demultiplexing_result)}.set{dm_input}
     }
-        if (params.generate_anndata == "True" || params.generate_mudata == "True" ){
-            Channel.fromPath(params.multi_input) \
-            | splitCsv(header:true) \
-            | map { row -> tuple(row.sampleId, row.hto_matrix_filtered, row.rna_matrix_filtered)}
-            | join(donor_match.out)
-            | set {input_generate_data}
-            generate_data(input_generate_data, params.generate_anndata, params.generate_mudata)
-        }
-    }
-    else if (params.mode == "donor_match"){
-
-        // Performing just donor matching
-        Channel.fromPath(params.multi_input) \
-        | splitCsv(header:true) \
-        | map { row -> tuple(row.sampleId, row.nsample, row.barcodes, row.celldata, row.vireo_parent_dir, row.demultiplexing_result)} \
-        | donor_match
-        if (params.generate_anndata == "True" || params.generate_mudata == "True" ){
-            Channel.fromPath(params.multi_input) \
-            | splitCsv(header:true) \
-            | map { row -> tuple(row.sampleId, row.hto_matrix_filtered, row.rna_matrix_filtered)}
-            | join(donor_match.out)
-            | set {input_generate_data}
-            generate_data(input_generate_data, params.generate_anndata, params.generate_mudata)
+
+
+    if (params.match_donor == "True" || params.mode == "donor_match"){
+        donor_match(dm_input)
+
+        if (params.generate_anndata == "True" || params.generate_mudata == "True" ){
+            input_channel.splitCsv(header:true).map { row -> tuple(row.sampleId, row.hto_matrix_filtered, row.rna_matrix_filtered)}.join(donor_match.out).set{input_generate_data}
+            generate_data(input_generate_data, params.generate_anndata, params.generate_mudata)
+        }
     }
-    }
+    }
 }
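For reference, `join` in the rewritten `run_multi` pairs items from two channels by their first element, the sample ID: that is what merges `gene_summary` and `hash_summary` per sample before `summary_all`, and what attaches each sample's metadata row to its result when building `dm_input`. A toy sketch of the same keyed join, with invented values:

// Toy illustration of the keyed join used in run_multi (values invented):
gene = Channel.of(['sampleA', 'gene_summary_dir'], ['sampleB', 'gene_summary_dir'])
hash = Channel.of(['sampleA', 'hash_summary_dir'], ['sampleB', 'hash_summary_dir'])
gene.join(hash).view()   // emits e.g. [sampleA, gene_summary_dir, hash_summary_dir]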
diff --git a/nextflow.config b/nextflow.config
index 78e3a0f..5af7260 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,11 +1,11 @@
 params {
-
+    subset_bam_to_comon_variants = true
+    cell_data = "None"
     // Nf core integration
     custom_config_version = 'master'
    custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}"
-
-
+    sample_name = 'hadge_process'
     outdir = "${launchDir}/results"
     mode = "rescue"
     generate_anndata = "True"
@@ -395,16 +395,22 @@ profiles {
             // queue = ...
             // clusterOptions = ...
             withLabel: big_mem {
-                cpus = 32
-                memory = 150.GB
+                cpus = 16
+                memory = {30.GB * task.attempt}
                 time =24.h
             }
             withLabel: small_mem {
-                cpus = 16
-                memory = 100.GB
+                cpus = 6
+                memory = {20.GB * task.attempt}
                 time =24.h
             }
+
+            withName: subset_bam_to_comon_variants{
+                cpus = 2
+                memory = {20.GB * task.attempt}
+            }
+
             withName: summary{
                 cpus = 1
                 memory = {10.GB * task.attempt}
diff --git a/subworkflows/HADGE.nf b/subworkflows/HADGE.nf
index 5437a4b..2ce5c35 100644
--- a/subworkflows/HADGE.nf
+++ b/subworkflows/HADGE.nf
@@ -4,16 +4,39 @@ include { run_multi } from "$projectDir/modules/multi_demultiplexing"
 include {run_single} from "$projectDir/modules/single_demultiplexing"
 include { summary } from "$projectDir/modules/multi/gene_demultiplexing"
 include { donor_match } from "$projectDir/modules/multi/donor_match"
+include {create_single_chanel_input} from "$projectDir/modules/multi/preprocessing/preprocessing"
+
 workflow HADGE {
     // Here we decide if it is a single-sample demultiplexing or multi-input demultiplexing run.
     if (params.multi_input == null){
         // Single Mode
-        run_single()
+        // Instead of a separate single-sample entry point, build a one-row input sheet and reuse the multi workflow, so the channels, workflows and processes are not duplicated.
+        create_single_chanel_input(
+            params.sample_name,
+            params.hto_matrix_raw,
+            params.hto_matrix_filtered,
+            params.rna_matrix_raw,
+            params.rna_matrix_filtered,
+            params.bam,
+            params.bai,
+            params.barcodes,
+            params.fasta,
+            params.fasta_index,
+            params.nsample,
+            params.cell_data,
+            params.vcf_mixed,
+            params.vcf_donor,
+            params.vireo_parent_dir,
+            params.demultiplexing_result,
+        )
+        run_multi(create_single_chanel_input.out.input_channel)
+
    }
    else{
        // Multi mode
-        run_multi()
+        input_channel = Channel.fromPath(params.multi_input)
+        run_multi(input_channel)
    }
}
@@ -21,12 +44,12 @@ workflow HADGE {
 
 workflow SUMMARY{
-    Channel.fromPath(params.multi_input) \
-    | splitCsv(header:true) \
-    | map { row-> tuple(row.sampleId, file(row.hto_matrix_filtered), file(row.rna_matrix_filtered))}
-    | set {input_list_summary}
+    log.info('running summary only')
+    Channel.fromPath(params.multi_input).splitCsv(header:true).map { row-> tuple(row.sampleId, file(row.hto_matrix_filtered), file(row.rna_matrix_filtered))}.set {input_list_summary}
+
+
    demuxlet_out = Channel.fromPath("${params.outdir}/*/genetic/gene_demulti/demuxlet/demuxlet_*", type: 'dir').collect().ifEmpty('no_result')
    freemuxlet_out= Channel.fromPath("${params.outdir}/*/genetic/gene_demulti/freemuxlet/freemuxlet_*", type: 'dir').collect().ifEmpty('no_result')
    vireo_out= Channel.fromPath("${params.outdir}/*/genetic/gene_demulti/vireo/vireo_*", type: 'dir').collect().ifEmpty('no_result')
@@ -45,10 +68,10 @@ workflow SUMMARY{
    summary(summary_input, params.generate_anndata, params.generate_mudata)
-    Channel.fromPath(params.multi_input) \
-    | splitCsv(header:true) \
-    | map { row-> tuple(row.sampleId, row.nsample, row.barcodes, "None", "None")}
-    | join(summary.out)
-    | donor_match
+    // Channel.fromPath(params.multi_input) \
+    // | splitCsv(header:true) \
+    // | map { row-> tuple(row.sampleId, row.nsample, row.barcodes, "None", "None")}
+    // | join(summary.out)
+    // | donor_match
}
\ No newline at end of file
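A note on the resource changes above: giving `memory` a closure makes Nextflow re-evaluate it per attempt, so `{20.GB * task.attempt}` only grows if the process is actually retried. A minimal sketch of the pairing this assumes (the exit-code range for out-of-memory kills is an assumption, not taken from this config):

// Sketch: retry-scaled memory only has an effect together with a retry policy.
process {
    withLabel: small_mem {
        memory        = { 20.GB * task.attempt }                              // 20 GB, then 40 GB, ...
        errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'finish' }  // assumed OOM-style exit codes
        maxRetries    = 2
    }
}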
diff --git a/test_data/download_data.sh b/test_data/download_data.sh
old mode 100644
new mode 100755
index ea3aa9b..14f9802
--- a/test_data/download_data.sh
+++ b/test_data/download_data.sh
@@ -6,7 +6,7 @@ wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1xl9g
 wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1jlhEO1Z7YGYVnxv1YO9arjDwGFeZbfkr' -O jurkat_293t_downsampled_n500_full_bam.bam.bai
 FILEID="13CV6CjP9VzmwG5MVHbJiVDMVdiIhGdJB"
 FILENAME="jurkat_293t_downsampled_n500_full_bam.bam"
-wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=$FILEID' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=$FILEID" -O $FILENAME && rm -rf /tmp/cookies.txt
+wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=13CV6CjP9VzmwG5MVHbJiVDMVdiIhGdJB' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=$FILEID" -O $FILENAME && rm -rf /tmp/cookies.txt
 wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1MmwEiOsdzEfRdXS6oXXBwMJXUovKWcni' -O final_res.zip
 unzip final_res.zip
 rm final_res.zip
diff --git a/test_data/hto/barcodes.tsv b/test_data/hto/barcodes.tsv
old mode 100644
new mode 100755
diff --git a/test_data/hto/genes.tsv b/test_data/hto/genes.tsv
old mode 100644
new mode 100755
diff --git a/test_data/hto/matrix.mtx b/test_data/hto/matrix.mtx
old mode 100644
new mode 100755
diff --git a/test_data/multi_sample_input.csv b/test_data/multi_sample_input.csv
old mode 100644
new mode 100755
diff --git a/test_data/rna/barcodes.tsv b/test_data/rna/barcodes.tsv
old mode 100644
new mode 100755
diff --git a/test_data/rna/genes.tsv b/test_data/rna/genes.tsv
old mode 100644
new mode 100755
diff --git a/test_data/rna/matrix.mtx b/test_data/rna/matrix.mtx
old mode 100644
new mode 100755
diff --git a/test_data/simulation.R b/test_data/simulation.R
old mode 100644
new mode 100755

From a37128305df6ed0f39c0a53d9271af647aa5a05e Mon Sep 17 00:00:00 2001
From: Matiss Ozols
Date: Fri, 2 Feb 2024 11:32:59 +0000
Subject: [PATCH 06/16] removed single scripts as they are redundant and
 duplicate efforts in maintaining pipeline

---
 bin/bff.R                                  |  12 +-
 conda/bff.yml                              |   2 +
 conda/condaenv.ucda_93c.requirements.txt   |   3 +
 modules/multi/hash_demulti/demuxem.nf      |   2 +-
 modules/single/donor_match.nf              | 105 --------
 modules/single/gene_demulti/bcftools.nf    |  33 ---
 modules/single/gene_demulti/cellsnp.nf     | 120 ---------
 modules/single/gene_demulti/demuxlet.nf    | 159 ------------
 modules/single/gene_demulti/freebayes.nf   | 287 ---------------------
 modules/single/gene_demulti/freemuxlet.nf  | 148 -----------
 modules/single/gene_demulti/samtools.nf    |  38 ---
 modules/single/gene_demulti/scsplit.nf     |  99 -------
 modules/single/gene_demulti/souporcell.nf  | 106 --------
 modules/single/gene_demulti/vireo.nf       | 116 ---------
 modules/single/gene_demultiplexing.nf      | 194 --------------
 modules/single/hash_demulti/bff.nf         |  82 ------
 modules/single/hash_demulti/demuxem.nf     |  70 -----
 modules/single/hash_demulti/gmm_demux.nf   |  82 ------
 modules/single/hash_demulti/hashedDrops.nf | 108 --------
 modules/single/hash_demulti/hashsolo.nf    |  65 -----
 modules/single/hash_demulti/htodemux.nf    | 114 --------
modules/single/hash_demulti/multiseq.nf | 62 ----- modules/single/hash_demulti/preprocess.nf | 65 ----- modules/single/hash_demultiplexing.nf | 179 ------------- modules/single_demultiplexing.nf | 118 --------- subworkflows/HADGE.nf | 1 - 26 files changed, 17 insertions(+), 2353 deletions(-) create mode 100644 conda/condaenv.ucda_93c.requirements.txt delete mode 100644 modules/single/donor_match.nf delete mode 100644 modules/single/gene_demulti/bcftools.nf delete mode 100644 modules/single/gene_demulti/cellsnp.nf delete mode 100755 modules/single/gene_demulti/demuxlet.nf delete mode 100644 modules/single/gene_demulti/freebayes.nf delete mode 100755 modules/single/gene_demulti/freemuxlet.nf delete mode 100644 modules/single/gene_demulti/samtools.nf delete mode 100644 modules/single/gene_demulti/scsplit.nf delete mode 100755 modules/single/gene_demulti/souporcell.nf delete mode 100755 modules/single/gene_demulti/vireo.nf delete mode 100644 modules/single/gene_demultiplexing.nf delete mode 100644 modules/single/hash_demulti/bff.nf delete mode 100644 modules/single/hash_demulti/demuxem.nf delete mode 100644 modules/single/hash_demulti/gmm_demux.nf delete mode 100755 modules/single/hash_demulti/hashedDrops.nf delete mode 100755 modules/single/hash_demulti/hashsolo.nf delete mode 100644 modules/single/hash_demulti/htodemux.nf delete mode 100644 modules/single/hash_demulti/multiseq.nf delete mode 100644 modules/single/hash_demulti/preprocess.nf delete mode 100644 modules/single/hash_demultiplexing.nf delete mode 100644 modules/single_demultiplexing.nf diff --git a/bin/bff.R b/bin/bff.R index b0ea7af..a5a5ad8 100755 --- a/bin/bff.R +++ b/bin/bff.R @@ -4,11 +4,21 @@ library(DropletUtils) library(Seurat) library(ggplot2) library(cowplot) +if(!require("cellhashR")){ + devtools::install_github(repo = 'bimberlab/cellhashR', ref = 'master', dependencies = TRUE, upgrade = 'always') + library("cellhashR") +} + library(cellhashR) library(here) library(dplyr) library(argparse) -library(tidyverse) + +if(!require("tidyverse")){ + install.packages("tidyverse") + library("tidyverse") +} + # Create a parser parser <- ArgumentParser("Parameters for BFF") diff --git a/conda/bff.yml b/conda/bff.yml index 295e53b..e014dff 100644 --- a/conda/bff.yml +++ b/conda/bff.yml @@ -3,6 +3,7 @@ channels: - conda-forge - bioconda dependencies: + - conda-forge::r-tidyverse - conda-forge::r-base>=4.2 - r-seurat - bioconductor-dropletutils @@ -17,3 +18,4 @@ dependencies: - conda-forge::r-here - conda-forge::r-argparse - conda-forge::r-dplyr + diff --git a/conda/condaenv.ucda_93c.requirements.txt b/conda/condaenv.ucda_93c.requirements.txt new file mode 100644 index 0000000..4b3fe5b --- /dev/null +++ b/conda/condaenv.ucda_93c.requirements.txt @@ -0,0 +1,3 @@ +GMM_Demux==0.2.1.3 +scikit-learn==1.1.3 +argparse \ No newline at end of file diff --git a/modules/multi/hash_demulti/demuxem.nf b/modules/multi/hash_demulti/demuxem.nf index 1b542f1..c96443b 100644 --- a/modules/multi/hash_demulti/demuxem.nf +++ b/modules/multi/hash_demulti/demuxem.nf @@ -5,7 +5,7 @@ process demuxem{ publishDir "$params.outdir/$sampleId/$params.mode/hash_demulti/demuxem", mode:'copy' label 'small_mem' - conda "bioconda::pegasuspy bioconda::scanpy bioconda::demuxEM" + conda "bioconda::pegasuspy demuxEM scanpy" input: tuple val(sampleId), path(raw_hto_matrix_dir, stageAs: "hto_data_${params.hto_matrix_demuxem}"), diff --git a/modules/single/donor_match.nf b/modules/single/donor_match.nf deleted file mode 100644 index 39f0a0b..0000000 --- 
a/modules/single/donor_match.nf +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -process matchDonor{ - publishDir "$projectDir/$params.outdir/$params.mode", mode: 'copy' - label 'big_mem' - - conda "$projectDir/conda/donor_match.yml" - - input: - path demultiplexing_result - val ndonor - path barcode_whitelist - val method1_name - val method2_name - val findVariants - val cell_genotype - val variant_count - val variant_pct - val vireo_parent_dir - - output: - path "donor_match" - - script: - def two_method = (method1_name != "None" & method2_name != "None") ? "--method1 $method1_name --method2 $method2_name" : "" - - def cell_genotype_path = "" - if (findVariants == "True" | findVariants == "default"){ - cell_genotype_path = cell_genotype != "None" ? "--cell_genotype $cell_genotype" : \ - "--cell_genotype $projectDir/$params.outdir/$params.mode/gene_demulti/cellSNP/cellsnp_1/*/cellSNP.cells.vcf.gz" - } - - def vireo_parent_path = "" - if ( findVariants == 'vireo' | findVariants == 'True' ){ - vireo_parent_path = (params.mode == "donor_match" & vireo_parent_dir != "None") ? "--vireo_parent_dir $vireo_parent_dir" : \ - "--vireo_parent_dir $projectDir/$params.outdir/$params.mode/gene_demulti/vireo/" - } - - def barcode_whitelist_path = (barcode_whitelist != "None") ? "--barcode $barcode_whitelist" : "" - - """ - export R_MAX_VSIZE=100Gb - outputdir=donor_match - mkdir -p \$outputdir - donor_match.R --result_csv $demultiplexing_result $barcode_whitelist_path --findVariants $findVariants \ - $cell_genotype_path --variant_pct $variant_pct --variant_count $variant_count --ndonor $ndonor \ - $two_method --outputdir \$outputdir $vireo_parent_path - - if ([ "$findVariants" != "False" ]); then - best_method_vireo="\$(head -n 1 \$outputdir/best_method_vireo.txt)" - if ([ "$params.mode" != "donor_match" ]); then - donor_genotype="\$(find $projectDir/$params.outdir/$params.mode/gene_demulti/vireo/\$best_method_vireo -name GT_donors.vireo.vcf.gz | head -n 1)" - else - donor_genotype="\$(find $vireo_parent_dir/\$best_method_vireo -name GT_donors.vireo.vcf.gz | head -n 1)" - fi - - if ([ "$findVariants" = "True" ] || [ "$findVariants" = "default" ]); then - gunzip -c \$donor_genotype > \$outputdir/GT_donors.vireo.vcf - if ([ \$(grep "^##config" \$outputdir/GT_donors.vireo.vcf | wc -l) == 0 ]); then - bcftools view --header-only $cell_genotype | grep "^##" | grep -v "^##bcftools" > \$outputdir/donor_with_header.vcf - grep -v "^##" \$outputdir/GT_donors.vireo.vcf >> \$outputdir/donor_with_header.vcf - bcftools sort \$outputdir/donor_with_header.vcf -Oz -o \$outputdir/compressed_sorted_donor_genotype.vcf.gz - rm \$outputdir/donor_with_header.vcf - else - bcftools sort \$donor_genotype -Oz -o \$outputdir/compressed_sorted_donor_genotype.vcf.gz - fi - bcftools index \$outputdir/compressed_sorted_donor_genotype.vcf.gz - bcftools filter \$outputdir/compressed_sorted_donor_genotype.vcf.gz -R \$outputdir/donor_specific_variants.csv > \$outputdir/donor_genotype_subset_by_default.vcf - bcftools reheader --samples \$outputdir/donor_match.csv -o \$outputdir/donor_genotype_subset_by_default_matched.vcf \$outputdir/donor_genotype_subset_by_default.vcf - - rm \$outputdir/GT_donors.vireo.vcf - fi - - if ([ "$findVariants" = "True" ]); then - bcftools filter \$outputdir/compressed_sorted_donor_genotype.vcf.gz -R \$outputdir/representative_variants_vireo.csv > \$outputdir/donor_genotype_subset_by_vireo.vcf - bcftools reheader --samples \$outputdir/donor_match.csv -o 
\$outputdir/donor_genotype_subset_by_vireo_matched.vcf \$outputdir/donor_genotype_subset_by_vireo.vcf - fi - - if ([ "$findVariants" = "vireo" ]); then - bcftools sort \$donor_genotype -Oz -o \$outputdir/compressed_sorted_donor_genotype.vcf.gz - bcftools index \$outputdir/compressed_sorted_donor_genotype.vcf.gz - bcftools filter \$outputdir/compressed_sorted_donor_genotype.vcf.gz -R \$outputdir/representative_variants_vireo.csv > \$outputdir/donor_genotype_subset_by_vireo.vcf - bcftools reheader --samples \$outputdir/donor_match.csv -o \$outputdir/donor_genotype_subset_by_vireo_matched.vcf \$outputdir/donor_genotype_subset_by_vireo.vcf - fi - - rm \$outputdir/best_method_vireo.txt - rm \$outputdir/compressed_sorted_donor_genotype.vcf.gz - rm \$outputdir/compressed_sorted_donor_genotype.vcf.gz.csi - - fi - - """ -} - - -workflow donor_match{ - take: - demultiplexing_result - main: - matchDonor(demultiplexing_result, params.nsample, params.barcodes, params.match_donor_method1, params.match_donor_method2, - params.findVariants, params.celldata, params.variant_count, params.variant_pct, params.vireo_parent_dir) - emit: - matchDonor.out -} diff --git a/modules/single/gene_demulti/bcftools.nf b/modules/single/gene_demulti/bcftools.nf deleted file mode 100644 index d53efe7..0000000 --- a/modules/single/gene_demulti/bcftools.nf +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 -process bcftools{ - publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti/bcftools", mode: 'copy' - label 'big_mem' - - conda "bioconda::bcftools=1.9" - - input: - val vcf - output: - path "bcftools_${task.index}" - - script: - vcf_files = vcf.join(" ") - """ - mkdir bcftools_${task.index} - bcftools concat -o bcftools_${task.index}/total_chroms.vcf ${vcf_files} - cd bcftools_${task.index} - bcftools sort total_chroms.vcf -o sorted_total_chroms.vcf - bcftools filter -i '%QUAL > 30' sorted_total_chroms.vcf -o filtered_sorted_total_chroms.vcf - """ - -} -workflow filter_variant{ - take: - vcf - main: - bcftools(vcf) - emit: - bcftools.out -} - diff --git a/modules/single/gene_demulti/cellsnp.nf b/modules/single/gene_demulti/cellsnp.nf deleted file mode 100644 index 7768740..0000000 --- a/modules/single/gene_demulti/cellsnp.nf +++ /dev/null @@ -1,120 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -process cellSNP{ - publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti/cellSNP", mode: 'copy' - label 'big_mem' - - conda "bioconda::cellsnp-lite" - - input: - path samFile_cellSNP - path indexFile_cellSNP - val regionsVCF_cellSNP - val targetsVCF_cellSNP - path barcodeFile_cellSNP - val sampleList_cellSNP - val sampleIDs_cellSNP - val genotype_cellSNP - val gzip_cellSNP - val printSkipSNPs_cellSNP - val nproc_cellSNP - val refseq_cellSNP - val chrom_cellSNP - val cellTAG_cellSNP - val UMItag_cellSNP - val minCOUNT_cellSNP - val minMAF_cellSNP - val doubletGL_cellSNP - val inclFLAG_cellSNP - val exclFLAG_cellSNP - val minLEN_cellSNP - val minMAPQ_cellSNP - val countORPHAN_cellSNP - val cellsnp_out - - - output: - path "cellsnp_${task.index}" - - script: - def samFile = samFile_cellSNP.name != 'None' ? "--samFile ${samFile_cellSNP}" : '' - def regionsVCF = regionsVCF_cellSNP != 'None' ? "--regionsVCF ${regionsVCF_cellSNP}" : '' - def targetsVCF = targetsVCF_cellSNP != 'None' ? "--targetsVCF ${targetsVCF_cellSNP}" : '' - def barcodeFile = barcodeFile_cellSNP.name != 'None' ? "--barcodeFile ${barcodeFile_cellSNP}" : '' - def sampleList = sampleList_cellSNP != 'None' ? 
"--sampleList ${sampleList_cellSNP}" : '' - def sampleIDs = sampleIDs_cellSNP != 'None' ? "--sampleIDs ${sampleIDs_cellSNP}" : '' - def genotype = genotype_cellSNP != 'False' ? "--genotype" : '' - def gzip = gzip_cellSNP != 'False' ? "--gzip" : '' - def printSkipSNPs = printSkipSNPs_cellSNP != 'False' ? "--printSkipSNPs" : '' - def nproc = nproc_cellSNP != 'None' ? "--nproc ${nproc_cellSNP}" : '' - def refseq = refseq_cellSNP != 'None' ? "--refseq ${refseq_cellSNP}" : '' - def chrom = chrom_cellSNP != 'None' ? "--chrom ${chrom_cellSNP}" : '' - def cellTAG = "--cellTAG ${cellTAG_cellSNP}" - def UMItag = "--UMItag ${UMItag_cellSNP}" - def minCOUNT = "--minCOUNT ${minCOUNT_cellSNP}" - def minMAF = "--minMAF ${minMAF_cellSNP}" - def doubletGL = doubletGL_cellSNP != 'False' ? "--doubletGL" : '' - def inclFLAG = inclFLAG_cellSNP != 'None' ? "--inclFLAG ${inclFLAG_cellSNP}" : '' - def exclFLAG = exclFLAG_cellSNP != 'None' ? "--exclFLAG ${exclFLAG_cellSNP}" : '' - def minLEN = "--minLEN ${minLEN_cellSNP}" - def minMAPQ = "--minMAPQ ${minMAPQ_cellSNP}" - def countORPHAN = countORPHAN_cellSNP != 'False' ? "--countORPHAN" : '' - def out = "cellsnp_${task.index}/${cellsnp_out}" - - """ - mkdir cellsnp_${task.index} - mkdir $out - touch cellsnp_${task.index}/params.csv - echo -e "Argument,Value \n samfile,${samFile_cellSNP} \n regionsVCF,${regionsVCF_cellSNP} \n targetsVCF,${targetsVCF_cellSNP} \n barcodeFile,${barcodeFile_cellSNP} \n sampleList,${sampleList_cellSNP} \n sampleIDs,${sampleIDs_cellSNP} \n genotype,${genotype_cellSNP} \n gzip,${gzip_cellSNP} \n printSkipSNPs,${printSkipSNPs_cellSNP} \n nproc,${nproc_cellSNP} \n refseq,${refseq_cellSNP} \n chrom,${chrom_cellSNP} \n cellTAG,${cellTAG_cellSNP} \n UMItag,${UMItag_cellSNP} \n minCOUNT,${minCOUNT_cellSNP} \n minMAF,${minMAF_cellSNP} \n doubletGL,${doubletGL_cellSNP} \n inclFLAG,${inclFLAG_cellSNP} \n exclFLAG,${exclFLAG_cellSNP} \n minLEN,${minLEN_cellSNP} \n minMAPQ,${minMAPQ_cellSNP} \n countORPHAN,${countORPHAN_cellSNP}" >> cellsnp_${task.index}/params.csv - cellsnp-lite $samFile $regionsVCF $targetsVCF $barcodeFile $sampleList $sampleIDs \ - $genotype $gzip $printSkipSNPs $nproc $refseq $chrom $cellTAG $UMItag $minCOUNT $minMAF $doubletGL \ - $inclFLAG $exclFLAG $minLEN $minMAPQ $countORPHAN --outDir $out - cd $out - gunzip -c cellSNP.cells.vcf.gz > cellSNP.cells.vcf - """ -} - - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() - } - else{ - Channel.from(input) - } -} - -workflow variant_cellSNP{ - take: - samFile - indexFile - main: - barcodeFile = channel.fromPath(params.barcodes) - regionsVCF = channel.value(params.common_variants_cellsnp) - targetsVCF = channel.value(params.targetsVCF) - sampleList = channel.value(params.sampleList) - sampleIDs = channel.value(params.sampleIDs) - genotype_cellSNP = channel.value(params.genotype_cellSNP) - gzip_cellSNP = channel.value(params.gzip_cellSNP) - printSkipSNPs = channel.value(params.printSkipSNPs) - nproc_cellSNP = channel.value(params.nproc_cellSNP) - refseq_cellSNP = channel.value(params.refseq_cellSNP) - chrom = channel.value(params.chrom) - cellTAG = channel.value(params.cellTAG) - UMItag = channel.value(params.UMItag) - minCOUNT = channel.value(params.minCOUNT) - minMAF = channel.value(params.minMAF) - doubletGL = channel.value(params.doubletGL) - inclFLAG = channel.value(params.inclFLAG) - exclFLAG = channel.value(params.exclFLAG) - minLEN = channel.value(params.minLEN) - minMAPQ = channel.value(params.minMAPQ) - countORPHAN = 
channel.value(params.countORPHAN) - cellsnp_out = channel.value(params.cellsnp_out) - cellSNP(samFile, indexFile, regionsVCF, targetsVCF, barcodeFile, sampleList, - sampleIDs, genotype_cellSNP, gzip_cellSNP, printSkipSNPs, nproc_cellSNP, refseq_cellSNP, - chrom, cellTAG, UMItag, minCOUNT, minMAF, doubletGL, inclFLAG, exclFLAG, minLEN, minMAPQ, countORPHAN, cellsnp_out) - emit: - cellSNP.out -} diff --git a/modules/single/gene_demulti/demuxlet.nf b/modules/single/gene_demulti/demuxlet.nf deleted file mode 100755 index 4cfab8a..0000000 --- a/modules/single/gene_demulti/demuxlet.nf +++ /dev/null @@ -1,159 +0,0 @@ -#!/usr/bin/env nextflow - -nextflow.enable.dsl=2 - -process demuxlet { - publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti/demuxlet", mode: 'copy' - label 'small_mem' - - conda "bioconda::popscle bioconda::samtools bioconda::bedtools " - - input: - each sam - each tag_group - each tag_UMI - each sm - each sm_list - each sam_verbose - each vcf_verbose - each skip_umi - - each cap_BQ - each min_BQ - each min_MQ - each min_TD - each excl_flag - - each group_list - each min_total - each min_uniq - each min_umi - each min_snp - - each plp - each vcf_donor - each field - each geno_error_offset - each geno_error_coeff - each r2_info - each min_mac - each min_callrate - - each alpha - each doublet_prior - each demuxlet_out - - - output: - path "demuxlet_${task.index}" - - script: - def samfile = "--sam $sam" - def taggroup = tag_group != 'None' ? "--tag-group ${tag_group}" : '' - def tagUMI = tag_UMI != 'None' ? "--tag-UMI ${tag_UMI}" : '' - def vcfref = plp == 'True' ? "--vcf vcfref.vcf" : "" - def vcfref_sort ="sort_vcf_same_as_bam.sh ${sam} ${vcf_donor} > vcfref.vcf" - def vcfref_name = plp == 'True' ? 'vcfref.vcf' : "No VCF Ref is used because plp is not performed." - def smlist = sm != 'None' ? "--sm $sm" : '' - def sm_list_file = sm_list != 'None' ? "--sm-list ${sm_list}" : '' - def sm_list_file_name = sm_list != 'None' ? file(sm_list).baseName : "No sm list file is given" - def samverbose = "--sam-verbose ${sam_verbose}" - def vcfverbose = "--vcf-verbose ${vcf_verbose}" - def skipumi = skip_umi != "False" ? "--skip-umi" : "" - def capBQ = "--cap-BQ ${cap_BQ}" - def minBQ = "--min-BQ ${min_BQ}" - def minMQ = "--min-MQ ${min_MQ}" - def minTD = "--min-TD ${min_TD}" - def exclflag = "--excl-flag ${excl_flag}" - def grouplist = group_list != 'None' ? "--group-list ${group_list}" : '' - def mintotal = "--min-total ${min_total}" - def minumi = "--min-umi ${min_umi}" - def minuniq = "--min-uniq ${min_uniq}" - def minsnp = "--min-snp ${min_snp}" - def plp_name = plp == 'True' ? 
"plp performed" : "plp not performed" - def vcfdonor = "--vcf vcfref.vcf" - def fieldinfo = "--field $field" - def genoerror_off = "--geno-error-offset ${geno_error_offset}" - def genoerror_cof = "--geno-error-coeff ${geno_error_coeff}" - def r2info = "--r2-info ${r2_info}" - def minmac = "--min-mac ${min_mac}" - def mincallrate = "--min-callrate ${min_callrate}" - def alpha_value = alpha.replaceAll(/,/, " --alpha ") - def doubletprior = "--doublet-prior ${doublet_prior}" - - """ - ${vcfref_sort} - mkdir demuxlet_${task.index} - touch demuxlet_${task.index}/params.csv - barcode_num=\$(wc -l < "${group_list}") - echo -e "Argument,Value \n samfile,${sam} \n tag_group,${tag_group} \n tag_UMI,${tag_UMI} \n vcf_ref,${vcfref_name} \n sm,${sm} \n sm_list_file,${sm_list_file_name} \n sam_verbose,${sam_verbose} \n vcf_verbose,${vcf_verbose} \n skip_umi,${skip_umi} \n cap_BQ,${cap_BQ} \n min_BQ,${min_BQ} \n min_MQ,${min_MQ} \n min_TD,${min_TD} \n excl_flag,${excl_flag} \n grouplist,${group_list} \n min_total,${min_total} \n min_uniq,${min_uniq} \n min_umi,${min_umi} \n min_snp,${min_snp} \n plpfile,${plp_name} \n vcf_donor,${vcf_donor} \n field,${field} \n geno_error_offset,${geno_error_offset} \n geno_error_coeff,${geno_error_coeff} \n r2_info,${r2_info} \n min_mac,${min_mac} \n min_callrate,${min_callrate} \n alpha,${alpha} \n doublet_prior,${doublet_prior}" >> demuxlet_${task.index}/params.csv - if [[ "$plp" != "True" ]] - then - popscle demuxlet $samfile $taggroup $tagUMI $vcfdonor $fieldinfo ${genoerror_off} ${genoerror_cof} $r2info $minmac \ - $mincallrate $smlist ${sm_list_file} --alpha ${alpha_value} $doubletprior $samverbose $vcfverbose $capBQ $minBQ \ - $minMQ $minTD $exclflag $grouplist $mintotal $minumi $minsnp --out demuxlet_${task.index}/${demuxlet_out} - else - mkdir demuxlet_${task.index}/plp - popscle dsc-pileup $samfile ${taggroup} ${tagUMI} $vcfref ${smlist} ${sm_list_file} ${samverbose} ${vcfverbose} \ - ${skipumi} ${capBQ} ${minBQ} ${minMQ} ${minTD} ${exclflag} ${grouplist} ${mintotal} ${minsnp} \ - --out demuxlet_${task.index}/plp/${demuxlet_out} - popscle demuxlet $taggroup $tagUMI --plp demuxlet_${task.index}/plp/${demuxlet_out} $vcfdonor $fieldinfo \ - ${genoerror_off} ${genoerror_cof} $r2info $minmac $mincallrate $smlist ${sm_list_file} --alpha ${alpha_value} \ - $doubletprior $samverbose $vcfverbose $capBQ $minBQ $minMQ $minTD $exclflag $grouplist $mintotal $minumi $minsnp \ - $minuniq --out demuxlet_${task.index}/${demuxlet_out} - - fi - """ - -} - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() - } - else{ - Channel.from(input) - } -} - -workflow demultiplex_demuxlet{ - take: - sam - main: - group_list = split_input(params.barcodes) - tag_group = split_input(params.tag_group) - tag_UMI = split_input(params.tag_UMI) - sm = split_input(params.sm) - sm_list = split_input(params.sm_list) - sam_verbose = split_input(params.sam_verbose) - vcf_verbose = split_input(params.vcf_verbose) - skip_umi = split_input(params.skip_umi) - cap_BQ = split_input(params.cap_BQ) - min_BQ = split_input(params.min_BQ) - min_MQ = split_input(params.min_MQ) - min_TD = split_input(params.min_TD) - excl_flag = split_input(params.excl_flag) - min_total = split_input(params.min_total) - min_umi = split_input(params.min_umi) - min_uniq = split_input(params.min_uniq) - min_snp = split_input(params.min_snp) - plp = split_input(params.plp) - vcfdonor = split_input(params.vcf_donor) - field = split_input(params.field) - geno_error_offset = 
split_input(params.geno_error_offset) - geno_error_coeff = split_input(params.geno_error_coeff) - r2_info= split_input(params.r2_info) - min_mac = split_input(params.min_mac) - min_callrate = split_input(params.min_callrate) - alpha = split_input(params.alpha) - doublet_prior = split_input(params.doublet_prior) - demuxlet_out = params.demuxlet_out - - demuxlet(sam, tag_group, tag_UMI, sm, sm_list, sam_verbose, vcf_verbose, skip_umi, - cap_BQ, min_BQ, min_MQ, min_TD, excl_flag, group_list, min_total, min_uniq, min_umi, - min_snp, plp, vcfdonor, field, geno_error_offset, geno_error_coeff, r2_info, min_mac, - min_callrate, alpha, doublet_prior, demuxlet_out) - - emit: - demuxlet.out.collect() -} diff --git a/modules/single/gene_demulti/freebayes.nf b/modules/single/gene_demulti/freebayes.nf deleted file mode 100644 index 6a3fa26..0000000 --- a/modules/single/gene_demulti/freebayes.nf +++ /dev/null @@ -1,287 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -process freebayes{ - publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti/freebayes", mode: 'copy' - label 'big_mem' - - conda "bioconda::freebayes=1.2" - - input: - path bam_freebayes - path bai_freebayes - val stdin_freebayes - val ref_freebayes - val ref_index_freebayes - val targets_freebayes - each region_freebayes - val samples_freebayes - val populations_freebayes - val cnv_map_freebayes - val vcf_freebayes - val gvcf_freebayes - val gvcf_chunk_freebayes - val gvcf_dont_use_chunk_freebayes - val variant_input_freebayes - val only_use_input_alleles_freebayes - val haplotype_basis_alleles_freebayes - val report_all_haplotype_alleles_freebayes - val report_monomorphic_freebayes - val pvar_freebayes - val strict_vcf_freebayes - val theta_freebayes - val ploidy_freebayes - val pooled_discrete_freebayes - val pooled_continuous_freebayes - val use_reference_allele_freebayes - val reference_quality_freebayes - val no_snps_freebayes - val no_indels_freebayes - val no_mnps_freebayes - val no_complex_freebayes - val use_best_n_alleles_freebayes - val haplotype_length_freebayes - val min_repeat_size_freebayes - val min_repeat_entropy_freebayes - val no_partial_observations_freebayes - val dont_left_align_indels_freebayes - val use_duplicate_reads_freebayes - val min_mapping_quality_freebayes - val min_base_quality_freebayes - val min_supporting_allele_qsum_freebayes - val min_supporting_mapping_qsum_freebayes - val mismatch_base_quality_threshold_freebayes - val read_mismatch_limit_freebayes - val read_max_mismatch_fraction_freebayes - val read_snp_limit_freebayes - val read_indel_limit_freebayes - val standard_filters_freebayes - val min_alternate_fraction_freebayes - val min_alternate_count_freebayes - val min_alternate_qsum_freebayes - val min_alternate_total_freebayes - val min_coverage_freebayes - val max_coverage_freebayes - val no_population_priors_freebayes - val hwe_priors_off_freebayes - val binomial_obs_priors_off_freebayes - val allele_balance_priors_off_freebayes - val observation_bias_freebayes - val base_quality_cap_freebayes - val prob_contamination_freebayes - val legacy_gls_freebayes - val contamination_estimates_freebayes - val report_genotype_likelihood_max_freebayes - val genotyping_max_iterations_freebayes - val genotyping_max_banddepth_freebayes - val posterior_integration_limits_freebayes - val exclude_unobserved_genotypes_freebayes - val genotype_variant_threshold_freebayes - val use_mapping_quality_freebayes - val harmonic_indel_quality_freebayes - val read_dependence_factor_freebayes - val 
genotype_qualities_freebayes - val debug_freebayes - val dd_freebayes - - - output: - path "*.vcf" - - script: - def stdin = stdin_freebayes != 'False' ? "--stdin" : '' - def targets = targets_freebayes != 'None' ? "--targets ${targets_freebayes}" : '' - def region = region_freebayes != 'None' ? "--region ${region_freebayes}" : '' - def samples = samples_freebayes != 'None' ? "--samples ${samples_freebayes}" : '' - def populations = populations_freebayes != 'None' ? "--populations ${populations}":'' - def cnv_map = cnv_map_freebayes != 'None' ? "--cnv-map ${cnv_map_freebayes}" : '' - - def gvcf = gvcf_freebayes !='False' ? "--gvcf":'' - def gvcf_chunk = gvcf_chunk_freebayes !='None' ? "--gvcf-chunk ${gvcf_chunk_freebayes}":'' - def gvcf_dont_use_chunk = gvcf_dont_use_chunk_freebayes != 'None' ?"--gvcf-dont-use-chunk ${gvcf_dont_use_chunk_freebayes}":'' - def variant_input = variant_input_freebayes != 'None' ? "--variant-input ${variant_input_freebayes}" : '' - def only_use_input_alleles = only_use_input_alleles_freebayes != 'False' ? "--only-use-input-alleles" : '' - def haplotype_basis_alleles = haplotype_basis_alleles_freebayes != 'None' ? "--haplotype-basis-alleles ${haplotype_basis_alleles_freebayes}":'' - def report_all_haplotype_alleles = report_all_haplotype_alleles_freebayes !='False' ? "--report-all-haplotype-alleles":'' - def report_monomorphic = report_monomorphic_freebayes !='False' ? "--report-monomorphic":'' - def pvar = "--pvar ${pvar_freebayes}" - def strict_vcf = strict_vcf_freebayes!='False' ? "--strict-vcf":'' - def theta = "--theta ${theta_freebayes}" - def ploidy = "--ploidy ${ploidy_freebayes}" - def pooled_discrete = pooled_discrete_freebayes!='False' ? "--pooled-discrete":'' - def pooled_continuous = pooled_continuous_freebayes !='False' ? "--pooled-continuous":'' - - def use_reference_allele = use_reference_allele_freebayes != 'False' ? "--use-reference-allele" : '' - def reference_quality = "--reference-quality ${reference_quality_freebayes}" - - def no_snps = no_snps_freebayes !='False' ? "--no-snps" :'' - def no_indels = no_indels_freebayes !='False' ? "--no-indels" :'' - def no_mnps = no_mnps_freebayes !='False' ? "--no-mnps" :'' - def no_complex = no_complex_freebayes !='False' ? "--no-complex" :'' - - def use_best_n_alleles = "--use-best-n-alleles ${use_best_n_alleles_freebayes}" - def haplotype_length = haplotype_length_freebayes = "--haplotype-length ${haplotype_length_freebayes}" - def min_repeat_size = "--min-repeat-size ${min_repeat_size_freebayes}" - def min_repeat_entropy = "--min-repeat-entropy ${min_repeat_entropy_freebayes}" - def no_partial_observations = no_partial_observations_freebayes !='False' ? "--no-partial-observations":'' - - def dont_left_align_indels = dont_left_align_indels_freebayes != 'False' ? "--dont-left-align-indels" : '' - - def use_duplicate_reads = use_duplicate_reads_freebayes != 'False' ? "--use-duplicate-reads" : '' - def min_mapping_quality = "--min-mapping-quality ${min_mapping_quality_freebayes}" - def min_base_quality = "--min-base-quality ${min_base_quality_freebayes}" - def min_supporting_allele_qsum = "--min-supporting-allele-qsum ${min_supporting_allele_qsum_freebayes}" - def min_supporting_mapping_qsum = "--min-supporting-mapping-qsum ${min_supporting_mapping_qsum_freebayes}" - def mismatch_base_quality_threshold = "--mismatch-base-quality-threshold ${mismatch_base_quality_threshold_freebayes}" - def read_mismatch_limit = read_mismatch_limit_freebayes != 'None' ? 
"-read-mismatch-limit ${read_mismatch_limit_freebayes}":'' - def read_max_mismatch_fraction = "--read-max-mismatch-fraction ${read_max_mismatch_fraction_freebayes}" - def read_snp_limit = read_snp_limit_freebayes != 'None' ? "--read-snp-limit ${read_snp_limit_freebayes}":'' - def read_indel_limit = read_indel_limit_freebayes != 'None' ? "--read-indel-limit ${read_indel_limit_freebayes}" :'' - def standard_filters = standard_filters_freebayes!='False' ? "--standard-filters":'' - def min_alternate_fraction = "--min-alternate-fraction ${min_alternate_fraction_freebayes}" - def min_alternate_count = "--min-alternate-count ${min_alternate_count_freebayes}" - def min_alternate_qsum = "--min-alternate-qsum ${min_alternate_qsum_freebayes}" - def min_alternate_total = "--min-alternate-total ${min_alternate_total_freebayes}" - def min_coverage = "--min-coverage ${min_coverage_freebayes}" - def max_coverage = max_coverage_freebayes !='None' ? "--max-coverage ${max_coverage_freebayes}":'' - - def no_population_priors = no_population_priors_freebayes != 'False' ? "--no-population-priors" : '' - def hwe_priors_off = hwe_priors_off_freebayes != 'False' ? "--hwe-priors-off" : '' - def binomial_obs_priors_off = binomial_obs_priors_off_freebayes != 'False' ? "--binomial-obs-priors-off" : '' - def allele_balance_priors_off = allele_balance_priors_off_freebayes != 'False' ? "--allele-balance-priors-off" : '' - def observation_bias = observation_bias_freebayes!= 'None' ? "--observation-bias ${observation_bias_freebayes}":'' - def base_quality_cap = base_quality_cap_freebayes !='None' ? "--base-quality-cap ${base_quality_cap_freebayes}":'' - def prob_contamination = "--prob-contamination ${prob_contamination_freebayes}" - def legacy_gls = legacy_gls_freebayes !='False' ? "--legacy-gls" :'' - def contamination_estimates = contamination_estimates_freebayes != 'None' ? "--contamination-estimates ${contamination_estimates_freebayes}":'' - - def report_genotype_likelihood_max = report_genotype_likelihood_max_freebayes !='False' ? "--report-genotype-likelihood-max":'' - def genotyping_max_iterations = "--genotyping-max-iterations ${genotyping_max_iterations_freebayes}" - def genotyping_max_banddepth = "--genotyping-max-banddepth ${genotyping_max_banddepth_freebayes}" - def posterior_integration_limits = "--posterior-integration-limits ${posterior_integration_limits_freebayes}" - def exclude_unobserved_genotypes = exclude_unobserved_genotypes_freebayes != 'False' ? "--exclude-unobserved-genotypes" : '' - def genotype_variant_threshold = genotype_variant_threshold_freebayes != 'None' ? "--genotype-variant-threshold ${genotype_variant_threshold_freebayes}":'' - def use_mapping_quality = use_mapping_quality_freebayes != 'False' ? "--use-mapping-quality" : '' - def harmonic_indel_quality = harmonic_indel_quality_freebayes !='False' ? "--harmonic-indel-quality" :'' - def read_dependence_factor = "--read-dependence-factor ${read_dependence_factor_freebayes}" - def genotype_qualities = genotype_qualities_freebayes !='False' ? "--genotype-qualities" :'' - - def debug = debug_freebayes != 'False' ? "--debug" : '' - def dd = dd_freebayes != 'False' ? 
"-dd" : '' - - """ - - freebayes ${bam_freebayes} $stdin -f ${ref_freebayes} $targets $region $samples $populations ${cnv_map} \ - -v ${region_freebayes}_${vcf_freebayes} $gvcf ${gvcf_chunk} ${gvcf_dont_use_chunk} ${variant_input} ${only_use_input_alleles} ${haplotype_basis_alleles} ${report_all_haplotype_alleles} ${report_monomorphic} $pvar ${strict_vcf} \ - $theta $ploidy ${pooled_discrete} ${pooled_continuous} \ - ${use_reference_allele} ${reference_quality} ${no_snps} ${no_indels} ${no_mnps} ${no_complex} ${use_best_n_alleles} ${haplotype_length} ${min_repeat_size} ${min_repeat_entropy} ${no_partial_observations} ${dont_left_align_indels} \ - ${use_duplicate_reads} ${min_mapping_quality} ${min_base_quality} ${min_supporting_allele_qsum} ${min_supporting_mapping_qsum} ${mismatch_base_quality_threshold} \ - ${read_mismatch_limit} ${read_max_mismatch_fraction} ${read_snp_limit} ${read_indel_limit} ${standard_filters} ${min_alternate_fraction} ${min_alternate_count} ${min_alternate_qsum} ${min_alternate_total} ${min_coverage} ${max_coverage} \ - ${no_population_priors} ${hwe_priors_off} ${binomial_obs_priors_off} ${allele_balance_priors_off} \ - ${observation_bias} ${base_quality_cap} ${prob_contamination} ${legacy_gls} ${contamination_estimates} \ - ${report_genotype_likelihood_max} ${genotyping_max_iterations} ${genotyping_max_banddepth} ${posterior_integration_limits} ${exclude_unobserved_genotypes} ${genotype_variant_threshold} \ - ${use_mapping_quality} ${harmonic_indel_quality} ${read_dependence_factor} ${genotype_qualities} $debug $dd - - """ -} - - -workflow variant_freebayes{ - take: - bam_freebayes - bai_freebayes - region_freebayes - main: - stdin_freebayes = Channel.value(params.stdin) - fasta_reference = Channel.value(params.fasta) - fasta_reference_index = Channel.value(params.fasta_index) - targets_freebayes = Channel.value(params.targets) - samples_freebayes = Channel.value(params.samples) - populations_freebayes = channel.value(params.populations) - cnv_map_freebayes = Channel.value(params.cnv_map) - vcf_freebayes = channel.value(params.vcf_freebayes) - gvcf_freebayes = channel.value(params.gvcf) - gvcf_chunk_freebayes = channel.value(params.gvcf_chunk) - gvcf_dont_use_chunk_freebayes = channel.value(params.gvcf_dont_use_chunk) - variant_input_freebayes = Channel.value(params.variant_input) - only_use_input_alleles_freebayes = Channel.value(params.only_use_input_alleles) - haplotype_basis_alleles_freebayes = channel.value(params.haplotype_basis_alleles) - report_all_haplotype_alleles_freebayes = channel.value(params.report_all_haplotype_alleles) - report_monomorphic_freebayes = channel.value(params.report_monomorphic) - pvar_freebayes = channel.value(params.pvar) - strict_vcf_freebayes = channel.value(params.strict_vcf) - - theta_freebayes = channel.value(params.theta) - ploidy_freebayes = channel.value(params.ploidy) - pooled_discrete_freebayes = channel.value(params.pooled_discrete) - pooled_continuous_freebayes = channel.value(params.pooled_continuous) - use_reference_allele_freebayes = channel.value(params.use_reference_allele) - reference_quality_freebayes = channel.value(params.reference_quality) - no_snps = channel.value(params.no_snps) - no_indels = channel.value(params.no_indels) - no_mnps = channel.value(params.no_mnps) - no_complex = channel.value(params.no_complex) - use_best_n_alleles_freebayes = channel.value(params.use_best_n_alleles) - haplotype_length_freebayes = channel.value(params.haplotype_length) - min_repeat_size_freebayes = 
channel.value(params.min_repeat_size) - min_repeat_entropy_freebayes = channel.value(params.min_repeat_entropy) - no_partial_observations_freebayes = channel.value(params.no_partial_observations) - - dont_left_align_indels_freebayes = channel.value(params.dont_left_align_indels) - use_duplicate_reads_freebayes = channel.value(params.use_duplicate_reads) - min_mapping_quality_freebayes = channel.value(params.min_mapping_quality) - min_base_quality_freebayes = channel.value(params.min_base_quality) - min_supporting_allele_qsum_freebayes = channel.value(params.min_supporting_allele_qsum) - min_supporting_mapping_qsum_freebayes = channel.value(params.min_supporting_mapping_qsum) - mismatch_base_quality_threshold_freebayes = channel.value(params.mismatch_base_quality_threshold) - read_mismatch_limit_freebayes = channel.value(params.read_mismatch_limit) - read_max_mismatch_fraction_freebayes = channel.value(params.read_max_mismatch_fraction) - read_snp_limit_freebayes = channel.value(params.read_snp_limit) - read_indel_limit_freebayes = channel.value(params.read_indel_limit) - standard_filters_freebayes = channel.value(params.standard_filters) - min_alternate_fraction_freebayes = channel.value(params.min_alternate_fraction) - min_alternate_count_freebayes = channel.value(params.min_alternate_count) - min_alternate_qsum_freebayes = channel.value(params.min_alternate_qsum) - min_alternate_total_freebayes = channel.value(params.min_alternate_total) - min_coverage_freebayes = channel.value(params.min_coverage) - max_coverage_freebayes = channel.value(params.max_coverage) - no_population_priors_freebayes = channel.value(params.no_population_priors) - - hwe_priors_off_freebayes = channel.value(params.hwe_priors_off) - binomial_obs_priors_off_freebayes = channel.value(params.binomial_obs_priors_off) - allele_balance_priors_off_freebayes = channel.value(params.allele_balance_priors_off) - observation_bias_freebayes = channel.value(params.observation_bias) - base_quality_cap_freebayes = channel.value(params.base_quality_cap) - prob_contamination_freebayes = channel.value(params.prob_contamination) - legacy_gls_freebayes = channel.value(params.legacy_gls) - contamination_estimates_freebayes = channel.value(params.contamination_estimates) - - report_genotype_likelihood_max_freebayes = channel.value(params.report_genotype_likelihood_max) - genotyping_max_iterations_freebayes = channel.value(params.genotyping_max_iterations) - genotyping_max_banddepth_freebayes = channel.value(params.genotyping_max_banddepth) - posterior_integration_limits_freebayes = channel.value(params.posterior_integration_limits) - exclude_unobserved_genotypes_freebayes = channel.value(params.exclude_unobserved_genotypes) - genotype_variant_threshold_freebayes = channel.value(params.genotype_variant_threshold) - use_mapping_quality_freebayes = channel.value(params.use_mapping_quality) - harmonic_indel_quality_freebayes = channel.value(params.harmonic_indel_quality) - read_dependence_factor_freebayes = channel.value(params.read_dependence_factor) - genotype_qualities_freebayes = channel.value(params.genotype_qualities) - debug_freebayes = channel.value(params.debug) - dd_freebayes = channel.value(params.dd) - - - freebayes(bam_freebayes, bai_freebayes, stdin_freebayes, fasta_reference, fasta_reference_index, targets_freebayes, region_freebayes, samples_freebayes, populations_freebayes, cnv_map_freebayes, \ - vcf_freebayes, gvcf_freebayes, gvcf_chunk_freebayes, gvcf_dont_use_chunk_freebayes, variant_input_freebayes, 
only_use_input_alleles_freebayes, haplotype_basis_alleles_freebayes, report_all_haplotype_alleles_freebayes, report_monomorphic_freebayes, pvar_freebayes, strict_vcf_freebayes, \ - theta_freebayes, ploidy_freebayes, pooled_discrete_freebayes, pooled_continuous_freebayes, use_reference_allele_freebayes, reference_quality_freebayes, \ - no_snps, no_indels, no_mnps, no_complex, use_best_n_alleles_freebayes, haplotype_length_freebayes, min_repeat_size_freebayes, min_repeat_entropy_freebayes, no_partial_observations_freebayes, \ - dont_left_align_indels_freebayes, use_duplicate_reads_freebayes, min_mapping_quality_freebayes, min_base_quality_freebayes, min_supporting_allele_qsum_freebayes,\ - min_supporting_mapping_qsum_freebayes, mismatch_base_quality_threshold_freebayes, read_mismatch_limit_freebayes, read_max_mismatch_fraction_freebayes, - read_snp_limit_freebayes, read_indel_limit_freebayes, standard_filters_freebayes, min_alternate_fraction_freebayes, min_alternate_count_freebayes, min_alternate_qsum_freebayes, min_alternate_total_freebayes, min_coverage_freebayes, max_coverage_freebayes, \ - no_population_priors_freebayes, hwe_priors_off_freebayes, binomial_obs_priors_off_freebayes, allele_balance_priors_off_freebayes, observation_bias_freebayes, base_quality_cap_freebayes, prob_contamination_freebayes, legacy_gls_freebayes, contamination_estimates_freebayes, \ - report_genotype_likelihood_max_freebayes, genotyping_max_iterations_freebayes, genotyping_max_banddepth_freebayes, posterior_integration_limits_freebayes, exclude_unobserved_genotypes_freebayes, \ - genotype_variant_threshold_freebayes, use_mapping_quality_freebayes, harmonic_indel_quality_freebayes, read_dependence_factor_freebayes, genotype_qualities_freebayes, debug_freebayes, dd_freebayes) - - emit: - freebayes.out.collect() - -} diff --git a/modules/single/gene_demulti/freemuxlet.nf b/modules/single/gene_demulti/freemuxlet.nf deleted file mode 100755 index bc22768..0000000 --- a/modules/single/gene_demulti/freemuxlet.nf +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -process freemuxlet { - publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti/freemuxlet", mode: 'copy' - label 'small_mem' - - conda "bioconda::popscle bioconda::samtools bioconda::bedtools bioconda::bcftools=1.9" - - input: - each sam - path(barcodes) - each vcf - each tag_group - each tag_UMI - each sm - each sm_list - each sam_verbose - each vcf_verbose - each skip_umi - each cap_BQ - each min_BQ - each min_MQ - each min_TD - each excl_flag - each group_list - each min_total - each min_uniq - each min_umi - each min_snp - each init_cluster - each nsample - each aux_files - each verbose - each doublet_prior - each bf_thres - each frac_init_clust - each iter_init - each keep_init_missing - each freemuxlet_out - - output: - path "freemuxlet_${task.index}" - - script: - def samfile = "--sam filtered_bam_file.bam" - def taggroup = tag_group != 'None' ? "--tag-group ${tag_group}" : '' - def tagUMI = tag_UMI != 'None' ? "--tag-UMI ${tag_UMI}" : '' - def vcffile = "--vcf samples.sorted_as_in_bam.vcf" - def smlist = sm != 'None' ? "--sm $sm" : '' - def sm_list_file = sm_list != 'None' ? "--sm-list ${sm_list}" : '' - def sm_list_file_name = sm_list != 'None' ? file(sm_list).baseName : "No sm list file is given" - def samverbose = "--sam-verbose ${sam_verbose}" - def vcfverbose = "--vcf-verbose ${vcf_verbose}" - def skipumi = skip_umi != "False" ? 
"--skip-umi" : "" - def capBQ = "--cap-BQ ${cap_BQ}" - def minBQ = "--min-BQ ${min_BQ}" - def minMQ = "--min-MQ ${min_MQ}" - def minTD = "--min-TD ${min_TD}" - def exclflag = "--excl-flag ${excl_flag}" - def grouplist = "--group-list ${group_list}" - def mintotal = "--min-total ${min_total}" - def minuniq = "--min-uniq ${min_uniq}" - def minumi = "--min-umi ${min_umi}" - def minsnp = "--min-snp ${min_snp}" - def initcluster = init_cluster != 'None' ? "--init-cluster ${init_cluster}" : '' - def n_sample = "--nsample $nsample" - def auxfiles = aux_files != 'False' ? "--aux-files" : '' - def verbose_info = "--verbose $verbose" - def doubletprior = "--doublet-prior ${doublet_prior}" - def bfthres = "--bf-thres ${bf_thres}" - def frac_init_cluster = "--frac-init-clust ${frac_init_clust}" - def iterinit = "--iter-init ${iter_init}" - def keepinit_missing = keep_init_missing != "False" ? "--keep-init-missing" : '' - - """ - echo 'test5' - filter_bam_file_for_popscle_dsc_pileup.sh ${sam} ${barcodes} ${vcf} filtered_bam_file.bam - sort_vcf_same_as_bam.sh filtered_bam_file.bam ${vcf} > samples.sorted_as_in_bam.vcf - - mkdir freemuxlet_${task.index} - mkdir freemuxlet_${task.index}/plp - touch freemuxlet_${task.index}/params.csv - echo -e "Argument,Value \n samfile,filtered_bam_file.bam \n tag_group,${tag_group} \n tag_UMI,${tag_UMI} \n vcf_file,samples.sorted_as_in_bam.vcf \n sm,${sm} \n sm_list_file,${sm_list_file_name} \n sam_verbose,${sam_verbose} \n vcf_verbose,${vcf_verbose} \n skip_umi,${skip_umi} \n cap_BQ,${cap_BQ} \n min_BQ,${min_BQ} \n min_MQ,${min_MQ} \n min_TD,${min_TD} \n excl_flag,${excl_flag} \n grouplist,${group_list} \n min_total,${min_total} \n min_uniq,${min_uniq} \n min_umi,${min_umi} \n min_snp,${min_snp} \n init_cluster,${init_cluster} \n nsample,${nsample} \n aux_files,${aux_files} \n verbose,${verbose} \n doublet_prior,${doublet_prior} \n bf_thres,${bf_thres} \n frac_init_clust,${frac_init_clust} \n iter_init,${iter_init} \n keep_init_missing,${keep_init_missing}" >> freemuxlet_${task.index}/params.csv - - popscle dsc-pileup $samfile ${taggroup} ${tagUMI} $vcffile ${smlist} ${sm_list_file} ${samverbose} \ - ${vcfverbose} ${skipumi} ${capBQ} ${minBQ} ${minMQ} ${minTD} ${exclflag} ${grouplist} ${mintotal} ${minuniq} \ - ${minsnp} --out freemuxlet_${task.index}/plp/${freemuxlet_out} - popscle freemuxlet --plp freemuxlet_${task.index}/plp/${freemuxlet_out} --out freemuxlet_${task.index}/${freemuxlet_out} \ - ${initcluster} ${n_sample} ${auxfiles} ${verbose_info} ${doubletprior} ${bfthres} ${frac_init_cluster} ${iterinit} \ - ${keepinit_missing} ${capBQ} ${minBQ} ${grouplist} ${mintotal} ${minumi} ${minsnp} - - """ - -} - - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{return it.tokenize(';')}.flatten() - } - else{ - Channel.from(input) - } -} - - -workflow demultiplex_freemuxlet{ - take: - sam - main: - group_list = split_input(params.barcodes) - vcf = split_input(params.common_variants_freemuxlet) - tag_group = split_input(params.tag_group) - tag_UMI = split_input(params.tag_UMI) - sm = split_input(params.sm) - sm_list = split_input(params.sm_list) - sam_verbose = split_input(params.sam_verbose) - vcf_verbose = split_input(params.vcf_verbose) - skip_umi = split_input(params.skip_umi) - - cap_BQ = split_input(params.cap_BQ) - min_BQ = split_input(params.min_BQ) - min_MQ = split_input(params.min_MQ) - min_TD = split_input(params.min_TD) - excl_flag = split_input(params.excl_flag) - min_total = split_input(params.min_total) - min_uniq = 
split_input(params.min_uniq) - min_umi = split_input(params.min_umi) - min_snp = split_input(params.min_snp) - init_cluster = split_input(params.init_cluster) - nsample = split_input(params.nsample) - aux_files = split_input(params.aux_files) - verbose = split_input(params.verbose) - doublet_prior = split_input(params.doublet_prior) - bf_thres = split_input(params.bf_thres) - frac_init_clust = split_input(params.frac_init_clust) - iter_init = split_input(params.iter_init) - keep_init_missing = split_input(params.keep_init_missing) - freemuxlet_out = params.freemuxlet_out - - freemuxlet(sam, params.barcodes, vcf, tag_group, tag_UMI, sm, sm_list, sam_verbose, vcf_verbose, skip_umi, cap_BQ, - min_BQ, min_MQ, min_TD, excl_flag, group_list, min_total, min_uniq, min_umi, min_snp, init_cluster, nsample, - aux_files, verbose, doublet_prior, bf_thres, frac_init_clust, iter_init, keep_init_missing, freemuxlet_out) - - emit: - freemuxlet.out.collect() -} diff --git a/modules/single/gene_demulti/samtools.nf b/modules/single/gene_demulti/samtools.nf deleted file mode 100644 index b6c0177..0000000 --- a/modules/single/gene_demulti/samtools.nf +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -process samtools{ - publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti/samtools", mode: 'copy' - label 'big_mem' - - conda "bioconda::samtools bioconda::umi_tools" - - input: - file bam - - output: - path "samtools_${task.index}" - - - script: - """ - mkdir samtools_${task.index} - samtools view -S -b -q 10 -F 3844 $bam > samtools_${task.index}/filtered.bam - samtools index samtools_${task.index}/filtered.bam samtools_${task.index}/filtered.bam.bai - umi_tools dedup --stdin=samtools_${task.index}/filtered.bam --extract-umi-method=tag --umi-tag=UR --cell-tag=CB --log=logfile > samtools_${task.index}/no_dup.bam - samtools sort samtools_${task.index}/no_dup.bam -o samtools_${task.index}/sorted.bam - samtools index samtools_${task.index}/sorted.bam samtools_${task.index}/sorted.bam.bai - """ -} - - -workflow data_preprocess{ - take: - bam - main: - samtools(bam) - emit: - samtools.out - -} - diff --git a/modules/single/gene_demulti/scsplit.nf b/modules/single/gene_demulti/scsplit.nf deleted file mode 100644 index e1e5df3..0000000 --- a/modules/single/gene_demulti/scsplit.nf +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -process scSplit{ - publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti/scSplit", mode: 'copy' - label 'big_mem' - - conda "$projectDir/conda/scsplit.yml" - - input: - each vcf - each bam - each bai - each barcode - each tag - each com - each ref - each alt - each num - each sub - each ems - each dbl - each vcf_known - each sample_geno - each scsplit_out - - - output: - path "scsplit_${task.index}" - - script: - - def common_data = com != 'None' ? "--com $com" : '' - def common_data_name = com != 'None' ? com : 'Common variants are not given.' - def vcf_known_data = vcf_known != 'None' ? "--vcf ${vcf_known}" : '' - def vcf_known_data_name = vcf_known != 'None' ? vcf_known : 'Known variants are not given.' - def sub_yesno = num == 0 ? "$sub": 'no_sub' - - def vcf_data = "-v $vcf" - def bam_data = "-i $bam" - def barcode_data = "-b $barcode" - def tag_data = "--tag $tag" - def num_data = "-n $num" - def sub_data = num == 0 ? "--sub $sub": '' - def ems_data = "--ems $ems" - def dbl_data = dbl != 'None' ? 
"--dbl $dbl" : '' - def out = "scsplit_${task.index}/${scsplit_out}" - - """ - git clone https://github.com/jon-xu/scSplit - mkdir scsplit_${task.index} - mkdir $out - touch scsplit_${task.index}/params.csv - echo -e "Argument,Value \n vcf,$vcf \n bam,$bam \n barcode,$barcode \n common_data,${common_data_name} \n num,${num} \n sub,${sub_yesno} \n ems,${ems} \n dbl,${dbl} \n vcf_known_data,${vcf_known_data_name}" >> scsplit_${task.index}/params.csv - - python scSplit/scSplit count ${vcf_data} ${bam_data} ${barcode_data} ${common_data} -r $ref -a $alt --out $out - python scSplit/scSplit run -r $out/$ref -a $out/$alt ${num_data} ${sub_data} ${ems_data} ${dbl_data} ${vcf_known_data} --out $out - - if [[ "$sample_geno" != "False" ]] - then - python scSplit/scSplit genotype -r $out/$ref -a $out/$alt -p $out/scSplit_P_s_c.csv --out $out - fi - """ -} - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() - } - else{ - Channel.from(input) - } -} - -workflow demultiplex_scSplit{ - take: - bam_scsplit - vcf_scsplit - bai_scsplit - main: - tag_scsplit = split_input(params.tag_group) - bar_scsplit = split_input(params.barcodes) - com_scsplit = split_input(params.common_variants_scSplit) - ref_scsplit = split_input(params.refscSplit) - alt_scsplit = split_input(params.altscSplit) - num_scsplit = split_input(params.nsample) - sub_scsplit = split_input(params.subscSplit) - ems_scsplit = split_input(params.emsscSplit) - dbl_scsplit = split_input(params.dblscSplit) - vcf_known_scsplit = split_input(params.vcf_donor) - sample_geno = split_input(params.sample_geno) - scsplit_out = params.scsplit_out - scSplit(vcf_scsplit, bam_scsplit, bai_scsplit, bar_scsplit, tag_scsplit, com_scsplit, ref_scsplit, - alt_scsplit, num_scsplit, sub_scsplit, ems_scsplit, dbl_scsplit, vcf_known_scsplit,sample_geno, scsplit_out) - emit: - scSplit.out.collect() - -} - diff --git a/modules/single/gene_demulti/souporcell.nf b/modules/single/gene_demulti/souporcell.nf deleted file mode 100755 index 2ad7fb9..0000000 --- a/modules/single/gene_demulti/souporcell.nf +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -process souporcell{ - publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti/souporcell", mode: 'copy' - label 'big_mem' - - container "shub://wheaton5/souporcell" - - input: - each bam - each barcodes - each fasta - each threads - each clusters - each ploidy - each min_alt - each min_ref - each max_loci - each restarts - each common_variants - each use_known_genotype - each known_genotypes - each known_genotypes_sample_names - each skip_remap - each ignore - each souporcell_out - - output: - path "souporcell_${task.index}" - - script: - def bamfile = "-i $bam" - def barcode = "-b $barcodes" - def fastafile = "-f $fasta" - def thread = "-t $threads" - def cluster = "-k $clusters" - def ploi = "--ploidy $ploidy" - def minalt = "--min_alt ${min_alt}" - def minref = "--min_ref ${min_ref}" - def maxloci = "--max_loci ${max_loci}" - def restart = restarts != 'None' ? "--restarts $restarts" : '' - - def commonvariant = (common_variants != 'None' & use_known_genotype != "True" & known_genotypes == 'None' )? "--common_variants ${common_variants}" : '' - def commonvariant_name = (common_variants != 'None' & use_known_genotype != "True" & known_genotypes == 'None' ) ? file(common_variants).baseName : 'no_common_variants' - - def knowngenotype = (use_known_genotype == "True" & known_genotypes != 'None') ? 
"--known_genotypes ${known_genotypes}" : '' - def knowngenotype_name = (use_known_genotype == "True" & known_genotypes != 'None') ? file(known_genotypes).baseName : 'no_known_genotypes' - - def knowngenotypes_sample = known_genotypes_sample_names != 'None' ? "--known_genotypes_sample_names ${known_genotypes_sample_names}" : '' - def knowngenotype_sample_name = known_genotypes_sample_names != 'None' ? file(known_genotypes_sample_names).baseName : 'no_knowngenotypes_sample_names' - - def skipremap = skip_remap != 'False' ? "--skip_remap True" : '' - def ign = ignore != 'False' ? "--ignore True" : '' - def out = "souporcell_${task.index}/${souporcell_out}" - - """ - mkdir souporcell_${task.index} - mkdir $out - touch souporcell_${task.index}/params.csv - echo -e "Argument,Value \n bamfile,${bam} \n barcode,${barcodes} \n fasta,${fasta} \n threads,${threads} \n clusters,${clusters} \n ploidy,${ploidy} \n min_alt,${min_alt} \n min_ref,${min_ref} \n max_loci,${max_loci} \n restarts,${restarts} \n common_variant,${commonvariant_name} \n known_genotype,${knowngenotype_name} \n known_genotype_sample,${knowngenotype_sample_name} \n skip_remap,${skip_remap} \n ignore,${ignore} " >> souporcell_${task.index}/params.csv - souporcell_pipeline.py --threads ${task.cpus} $bamfile $barcode $fastafile $thread $cluster $ploi $minalt $minref $maxloci $restart $commonvariant $knowngenotype $knowngenotypes_sample $skipremap $ign -o $out - """ -} - - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() - } - else{ - Channel.from(input) - } -} - - -workflow demultiplex_souporcell{ - take: - bam - main: - barcodes = split_input(params.barcodes) - fasta = split_input(params.fasta) - threads = split_input(params.threads) - clusters = split_input(params.nsample) - - ploidy = split_input(params.ploidy) - min_alt = split_input(params.min_alt) - min_ref = split_input(params.min_ref) - max_loci = split_input(params.max_loci) - restarts = split_input(params.restarts) - - common_variants = split_input(params.common_variants_souporcell) - use_known_genotype = split_input(params.use_known_genotype) - known_genotypes = split_input(params.vcf_donor) - known_genotypes_sample_names = split_input(params.known_genotypes_sample_names) - skip_remap = split_input(params.skip_remap) - ignore = split_input(params.ignore) - souporcell_out = params.souporcell_out - - souporcell(bam, barcodes, fasta, threads, clusters, ploidy, min_alt, min_ref, max_loci, restarts, - common_variants, use_known_genotype, known_genotypes, known_genotypes_sample_names, skip_remap, - ignore, souporcell_out) - - emit: - souporcell.out.collect() -} diff --git a/modules/single/gene_demulti/vireo.nf b/modules/single/gene_demulti/vireo.nf deleted file mode 100755 index fbe383b..0000000 --- a/modules/single/gene_demulti/vireo.nf +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -process vireo{ - publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti/vireo", mode: 'copy' - label 'big_mem' - - conda "aksarkar::vireosnp" - - input: - each celldata - each ndonor - each donorfile - each genoTag - each noDoublet - each nInit - each extraDonor - each extraDonorMode - each forceLearnGT - each ASEmode - each noPlot - each randSeed - each cellRange - each callAmbientRNAs - each nproc - each findVariant - each vireo_out - - - output: - path "vireo_${task.index}" - - - script: - def cell_data = "-c $celldata" - def n_donor = ndonor != 'None'? 
"-N $ndonor" : '' - def n_donor_yesno = ndonor != 'None'? "$ndonor" : "Number of donors are not given" - def donor = donorfile != 'None' ? "-d no_prefix.vcf" : '' - def donor_no_chr_cmd = donorfile != 'None' ? "zcat $donorfile | awk '{gsub(/^chr/,\"\"); print}' | awk '{gsub(/ID=chr/,\"ID=\"); print}' > no_prefix.vcf" : '' - def donor_data_name = donorfile != 'None' ? donorfile : 'Donor file is not given' - def geno_tag = donorfile != 'None' ? "--genoTag $genoTag" : '' - def no_doublet = noDoublet != 'False' ? "--noDoublet" : '' - def n_init = "--nInit $nInit" - def extra_donor = "--extraDonor $extraDonor" - def extradonor_mode = extraDonorMode != 'distance' ? "--extraDonorMode $extraDonorMode" : '' - def learnGT = (forceLearnGT != 'False' && donorfile != 'None')? "--forceLearnGT" : '' - def learnGT_yesno = (forceLearnGT != 'False' && donorfile != 'None')? "$forceLearnGT" : 'False' - def ase_mode = ASEmode != 'False' ? "--ASEmode" : '' - def no_plot = noPlot != 'False' ? "--noPlot" : '' - def random_seed = randSeed != 'None'? "--randSeed $randSeed" : '' - def cell_range = cellRange != 'all'? "--cellRange $cellRange" : '' - def call_ambient_rna = callAmbientRNAs != 'False' ? "--callAmbientRNAs" : '' - def n_proc = "--nproc $nproc" - - """ - mkdir vireo_${task.index} - mkdir vireo_${task.index}/${vireo_out} - touch vireo_${task.index}/params.csv - - ${donor_no_chr_cmd} - echo -e "Argument,Value \n cell_data,${celldata} \n n_donor,${n_donor_yesno} \n donor_data,${donor_data_name} \n genoTag,${genoTag} \n noDoublet,${noDoublet} \n nInit,${nInit} \n extraDonor,${extraDonor} \n extraDonorMode,${extraDonorMode} \n learnGT,${learnGT_yesno} \n ASEmode,${ASEmode} \n noPlot,${noPlot} \n randSeed,${randSeed} \n cellRange,${cellRange} \n callAmbientRNAs,${callAmbientRNAs} \n nproc,${nproc}" >> vireo_${task.index}/params.csv - - vireo ${cell_data} ${n_donor} $donor ${geno_tag} ${no_doublet} ${n_init} ${extra_donor} ${extradonor_mode} \ - $learnGT ${ase_mode} ${no_plot} ${random_seed} ${cell_range} ${call_ambient_rna} ${n_proc} \ - -o vireo_${task.index}/${vireo_out} - if ([ "$donorfile" = "None" ]); then - if ([ "$findVariant" = "True" ] || [ "$findVariant" = "vireo" ]); then - GTbarcode -i vireo_${task.index}/${vireo_out}/GT_donors.vireo.vcf.gz -o vireo_${task.index}/${vireo_out}/filtered_variants.tsv ${randSeed} - fi - fi - - """ - -} - - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() - } - else{ - Channel.from(input) - } -} - -workflow demultiplex_vireo{ - take: - celldata - - main: - ndonor = split_input(params.nsample) - donorfile = split_input(params.vcf_donor) - genoTag = split_input(params.genoTag) - - noDoublet = split_input(params.noDoublet) - nInit = split_input(params.nInit) - extraDonor = split_input(params.extraDonor) - extraDonorMode = split_input(params.extraDonorMode) - forceLearnGT = split_input(params.forceLearnGT) - - ASEmode = split_input(params.ASEmode) - noPlot = split_input(params.noPlot) - randSeed = split_input(params.randSeed) - cellRange = split_input(params.cellRange) - callAmbientRNAs = split_input(params.callAmbientRNAs) - nproc = split_input(params.nproc) - findVariant = split_input(params.findVariants) - vireo_out = params.vireo_out - - vireo(celldata, ndonor, donorfile, genoTag, noDoublet, nInit, extraDonor, extraDonorMode, forceLearnGT, - ASEmode, noPlot, randSeed, cellRange, callAmbientRNAs, nproc, findVariant, vireo_out) - - - emit: - vireo.out.collect() -} diff --git a/modules/single/gene_demultiplexing.nf 
b/modules/single/gene_demultiplexing.nf deleted file mode 100644 index bb6b921..0000000 --- a/modules/single/gene_demultiplexing.nf +++ /dev/null @@ -1,194 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -include { data_preprocess } from './gene_demulti/samtools' -include { filter_variant } from './gene_demulti/bcftools' -include { variant_cellSNP } from './gene_demulti/cellsnp' -include { variant_freebayes } from './gene_demulti/freebayes' -include { demultiplex_demuxlet } from './gene_demulti/demuxlet' -include { demultiplex_freemuxlet } from './gene_demulti/freemuxlet' -include { demultiplex_scSplit } from './gene_demulti/scsplit' -include { demultiplex_souporcell } from './gene_demulti/souporcell' -include { demultiplex_vireo } from './gene_demulti/vireo' - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() - } - else{ - Channel.from(input) - } -} - -process summary{ - publishDir "$projectDir/$params.outdir/$params.mode/gene_demulti", mode: 'copy' - label 'small_mem' - - conda "pandas scanpy mudata" - - input: - val demuxlet_result - val freemuxlet_result - val vireo_result - val souporcell_result - val scsplit_result - val generate_anndata - val generate_mudata - val rna_matrix - val hto_matrix - output: - path genetic_summary - - script: - def demuxlet_files = "" - def freemuxlet_files = "" - def vireo_files = "" - def souporcell_files = "" - def scsplit_files = "" - def generate_adata = "" - def generate_mdata = "" - - if (demuxlet_result != "no_result"){ - demuxlet_files = "--demuxlet ${demuxlet_result.join(":")}" - } - if (freemuxlet_result != "no_result"){ - freemuxlet_files = "--freemuxlet ${freemuxlet_result.join(":")}" - } - if (vireo_result != "no_result"){ - vireo_files = "--vireo ${vireo_result.join(":")}" - } - if (souporcell_result != "no_result"){ - souporcell_files = "--souporcell ${souporcell_result.join(":")}" - } - if (scsplit_result != "no_result"){ - scsplit_files = "--scsplit ${scsplit_result.join(":")}" - } - if (scsplit_result != "no_result"){ - scsplit_files = "--scsplit ${scsplit_result.join(":")}" - } - if (generate_anndata == "True"){ - if(rna_matrix == "None"){ - error "Error: RNA count matrix is not given." - } - generate_adata = "--generate_anndata --read_rna_mtx $rna_matrix" - } - if (generate_mudata == "True"){ - if(rna_matrix == "None"){ - error "Error: RNA count matrix is not given." - } - if(hto_matrix == "None"){ - error "Error: HTO count matrix is not given." 
- } - generate_mdata = "--generate_mudata --read_rna_mtx $rna_matrix --read_hto_mtx $hto_matrix" - } - """ - summary_gene.py $demuxlet_files $vireo_files $souporcell_files $scsplit_files $freemuxlet_files $generate_adata $generate_mdata - """ -} - - -workflow gene_demultiplexing { - main: - input_bam = Channel.fromPath(params.bam) - input_bai = Channel.fromPath(params.bai) - - if ((params.demuxlet == "True" & params.demuxlet_preprocess != 'False')| \ - (params.freemuxlet == "True" & params.freemuxlet_preprocess != 'False')| \ - (params.scSplit == "True" & params.scSplit_preprocess != 'False') | \ - (params.vireo == "True" & params.vireo_preprocess != 'False') | \ - (params.souporcell == "True" & params.souporcell_preprocess != 'False')){ - data_preprocess(input_bam) - qc_bam = data_preprocess.out.map{ return it + "/sorted.bam"} - qc_bam_bai = data_preprocess.out.map{ return it + "/sorted.bam.bai"} - } - - if (params.vireo == "True" & params.vireo_variant == 'True'){ - if(params.vireo_preprocess != 'False'){ - variant_cellSNP(qc_bam, qc_bam_bai) - } - else{ - variant_cellSNP(input_bam, input_bai) - } - cellsnp_vcf = variant_cellSNP.out.map{ return it + "/*/cellSNP.cells.vcf"} - } - - if (params.scSplit == "True" & params.scSplit_variant == 'True' ){ - freebayes_region = Channel.from(1..22, "X","Y").flatten() - if (params.region != "None"){ - freebayes_region = split_input(params.region) - } - if(params.scSplit_preprocess != 'False'){ - variant_freebayes(qc_bam, qc_bam_bai, freebayes_region) - } - else{ - variant_freebayes(input_bam, input_bai, freebayes_region) - } - filter_variant(variant_freebayes.out) - freebayes_vcf = filter_variant.out.map{ return it + "/filtered_sorted_total_chroms.vcf"} - - } - - if (params.demuxlet == "True"){ - // This will be only run if the genotype provided is not None - bam = params.demuxlet_preprocess == 'True'? qc_bam: input_bam //qc_bam.mix(input_bam)) - demultiplex_demuxlet(bam) - demuxlet_out = demultiplex_demuxlet.out - } - else{ - demuxlet_out = channel.value("no_result") - } - - - if (params.freemuxlet == "True"){ - // This will be run regardless if the genotype is provided to pipeline or not since freemuxlet is a genotype absent mode. - - bam = params.freemuxlet_preprocess == 'True'? qc_bam: input_bam // qc_bam.mix(input_bam)) - - demultiplex_freemuxlet(bam) - freemuxlet_out = demultiplex_freemuxlet.out - } - else{ - freemuxlet_out = channel.value("no_result") - } - - - if (params.vireo == "True"){ - vcf = params.vireo_variant != 'True'? Channel.fromPath(params.celldata): - variant_cellSNP.out.map{ return it + "/*/cellSNP.cells.vcf"} - //variant_cellSNP.out.map{ return it + "/*/cellSNP.cells.vcf"}.mix(Channel.fromPath(params.celldata))) - demultiplex_vireo(vcf) - vireo_out = demultiplex_vireo.out - } - else{ - vireo_out = channel.value("no_result") - } - - if (params.scSplit == "True"){ - bam = params.scSplit_preprocess == 'True'? qc_bam: input_bam // qc_bam.mix(input_bam)) - bai = params.scSplit_preprocess == 'True'? qc_bam_bai: input_bai // qc_bam_bai.mix(input_bai)) - vcf = params.scSplit_variant != 'True'? Channel.fromPath(params.vcf_mixed): - filter_variant.out.map{ return it + "/filtered_sorted_total_chroms.vcf"} - //freebayes_vcf.mix(Channel.fromPath(params.vcf_mixed))) - demultiplex_scSplit(bam, vcf, bai) - scSplit_out = demultiplex_scSplit.out - } - else{ - scSplit_out = channel.value("no_result") - } - - if (params.souporcell == "True"){ - bam = params.souporcell_preprocess == 'True'? 
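
// The "no_result" value used throughout this workflow is a sentinel, not a path: a
// tool that was switched off contributes channel.value("no_result"), and summary only
// renders a CLI flag for genuine results. A stripped-down, plain-Groovy sketch of
// that contract; names and paths are illustrative:
def summary_flag(tool, result){
    result != 'no_result' ? "--${tool} ${result.join(':')}" : ''
}
assert summary_flag('vireo', ['vireo_1', 'vireo_2']) == '--vireo vireo_1:vireo_2'
assert summary_flag('vireo', 'no_result') == ''
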
qc_bam: input_bam // qc_bam.mix(input_bam)) - demultiplex_souporcell(bam) - souporcell_out = demultiplex_souporcell.out - } - else{ - souporcell_out = channel.value("no_result") - } - - summary(demuxlet_out, freemuxlet_out, vireo_out, souporcell_out, scSplit_out, - params.generate_anndata, params.generate_mudata, - params.rna_matrix_filtered, params.hto_matrix_filtered) - emit: - summary.out -} - diff --git a/modules/single/hash_demulti/bff.nf b/modules/single/hash_demulti/bff.nf deleted file mode 100644 index 1160786..0000000 --- a/modules/single/hash_demulti/bff.nf +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -process bff{ - publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/bff", mode:'copy' - label 'small_mem' - - conda "$projectDir/conda/bff.yml" - - - input: - - path hto_matrix, stageAs: 'hto_data' - each methods - each methodsForConsensus - each cellbarcodeWhitelist - each metricsFile - each doTSNE - each doHeatmap - each perCellSaturation - each majorityConsensusThreshold - each chemistry - each callerDisagreementThreshold - each assignmentOutBff - each preprocess_bff - each barcodeWhitelist - - output: - path "bff_${task.index}" - - - script: - - """ - mkdir bff_${task.index} - bff.R --fileHto hto_data --methods $methods --methodsForConsensus $methodsForConsensus \ - --cellbarcodeWhitelist $cellbarcodeWhitelist --metricsFile bff_${task.index}_$metricsFile \ - --doTSNE $doTSNE --doHeatmap $doHeatmap --perCellSaturation $perCellSaturation --majorityConsensusThreshold $majorityConsensusThreshold \ - --chemistry $chemistry --callerDisagreementThreshold $callerDisagreementThreshold --outputdir bff_${task.index} --assignmentOutBff $assignmentOutBff \ - --preprocess $preprocess_bff --barcodeWhitelist $barcodeWhitelist - """ - -} - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() - } - else{ - Channel.from(input) - } -} - -workflow bff_hashing{ - take: - hto_matrix - main: - methods = split_input(params.methods) - methodsForConsensus = split_input(params.methodsForConsensus) - cellbarcodeWhitelist = split_input(params.cellbarcodeWhitelist) - metricsFile = split_input(params.metricsFile) - doTSNE = split_input(params.doTSNE) - doHeatmap = split_input(params.doHeatmap) - perCellSaturation = split_input(params.perCellSaturation) - majorityConsensusThreshold = split_input(params.majorityConsensusThreshold) - chemistry = split_input(params.chemistry) - callerDisagreementThreshold = split_input(params.callerDisagreementThreshold) - assignmentOutBff = split_input(params.assignmentOutBff) - preprocess_bff = split_input(params.preprocess_bff) - barcodeWhitelist = split_input(params.barcodeWhitelist) - - bff(hto_matrix, methods, methodsForConsensus,cellbarcodeWhitelist, metricsFile,doTSNE,doHeatmap,perCellSaturation,majorityConsensusThreshold,chemistry,callerDisagreementThreshold,assignmentOutBff,preprocess_bff,barcodeWhitelist) - - emit: - bff.out.collect() -} - - -workflow{ - bff_hashing() - -} diff --git a/modules/single/hash_demulti/demuxem.nf b/modules/single/hash_demulti/demuxem.nf deleted file mode 100644 index 32d59b3..0000000 --- a/modules/single/hash_demulti/demuxem.nf +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -process demuxem{ - publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/demuxem", mode:'copy' - label 'small_mem' - - conda "bioconda::pegasuspy demuxEM scanpy" - - input: - path raw_rna_matrix_dir, stageAs: 
"rna_data_${params.rna_matrix_demuxem}" - path raw_hto_matrix_dir, stageAs: "hto_data_${params.hto_matrix_demuxem}" - val threads - each alpha - each alpha_noise - each tol - each min_num_genes - each min_num_umis - each min_signal - each random_state - each generate_gender_plot - val objectOutDemuxem - each filter_demuxem - output: - path "demuxem_${task.index}" - - script: - def generateGenderPlot = generate_gender_plot != "None" ? " --generateGenderPlot ${generate_gender_plot}" : '' - """ - mkdir demuxem_${task.index} - demuxem.py --rna_matrix_dir rna_data_${params.rna_matrix_demuxem} --hto_matrix_dir hto_data_${params.hto_matrix_demuxem} \ - --randomState $random_state --min_signal $min_signal --tol $tol --min_num_genes $min_num_genes --min_num_umis $min_num_umis \ - --alpha $alpha --alpha_noise $alpha_noise --n_threads $threads $generateGenderPlot --objectOutDemuxem $objectOutDemuxem \ - --outputdir demuxem_${task.index} --filter_demuxem $filter_demuxem - """ - -} - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() - } - else{ - Channel.from(input) - } -} - -workflow demuxem_hashing{ - take: - hto_matrix - rna_matrix - main: - threads = params.threads_demuxem - alpha = split_input(params.alpha_demuxem) - alpha_noise = split_input(params.alpha_noise) - min_num_genes = split_input(params.min_num_genes) - min_num_umis = split_input(params.min_num_umis) - min_signal = split_input(params.min_signal) - tol = split_input(params.tol) - random_state = split_input(params.random_state) - generate_gender_plot = split_input(params.generate_gender_plot) - objectOutDemuxem = params.objectOutDemuxem - filter_demuxem = split_input(params.filter_demuxem) - - demuxem(rna_matrix, hto_matrix, threads, alpha, alpha_noise, tol, min_num_genes, min_num_umis, - min_signal, random_state, generate_gender_plot, objectOutDemuxem, filter_demuxem) - - emit: - demuxem.out.collect() -} \ No newline at end of file diff --git a/modules/single/hash_demulti/gmm_demux.nf b/modules/single/hash_demulti/gmm_demux.nf deleted file mode 100644 index 90a0672..0000000 --- a/modules/single/hash_demulti/gmm_demux.nf +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -process gmm_demux{ - publishDir "$projectDir/$params.outdir/$sampleId/$params.mode/hash_demulti/gmm_demux", mode:'copy' - label 'small_mem' - conda "$projectDir/conda/gmm_demux.yml" - - input: - tuple val(sampleId), path(filtered_hto_matrix_dir), val(hto_name_gmm) - //HTO names as string separated by commas - //val hto_name_gmm - //mode 2 - //need estimate number of cells in the single cell assay - //obligatory - val summary - //need to be combined with summary to get a report as file - val report_gmm - //mode 4 - // write csv or tsv - type of input - val mode_GMM - //case 5 - val extract - //float between 0 and 1 - val threshold_gmm - val ambiguous - - - - output: - path "gmm_demux_${sampleId}" - - script: - def extract_droplets = extract != 'None' ? " -x ${extract}" : '' - def ambiguous_droplets = extract != 'None' ? 
" --ambiguous ${ambiguous}" : '' - - if(mode_GMM=="csv"){ - """ - mkdir gmm_demux_${sampleId} - touch gmm_demux_${sampleId}_$report_gmm - - GMM-demux -c $filtered_hto_matrix_dir $hto_name_gmm -u $summary --report gmm_demux_${sampleId}_$report_gmm --full gmm_demux_${sampleId} $extract_droplets -t $threshold_gmm - gmm_demux_params.py --path_hto $filtered_hto_matrix_dir --hto_name_gmm $hto_name_gmm --summary $summary --report gmm_demux_${sampleId}_$report_gmm --mode $mode_GMM $extract_droplets --threshold_gmm $threshold_gmm $ambiguous_droplets --outputdir gmm_demux_${sampleId} - - """ - }else { - """ - mkdir gmm_demux_${sampleId} - touch gmm_demux_${sampleId}_$report_gmm - - GMM-demux $filtered_hto_matrix_dir $hto_name_gmm -u $summary -r gmm_demux_${sampleId}_$report_gmm --full gmm_demux_${sampleId} -o gmm_demux_${sampleId} $extract_droplets -t $threshold_gmm - gmm_demux_params.py --path_hto $filtered_hto_matrix_dir --hto_name_gmm $hto_name_gmm --summary $summary --report gmm_demux_${sampleId}_$report_gmm --mode $mode_GMM $extract_droplets --threshold_gmm $threshold_gmm $ambiguous_droplets --outputdir gmm_demux_${sampleId} - - """ - } - - -} - - -workflow gmm_demux_hashing{ -take: - hto_matrix - main: - summary = params.summary - report_gmm = params.report_gmm - mode = params.mode_GMM - extract = params.extract - threshold_gmm = params.threshold_gmm - ambiguous = params.ambiguous - - gmm_demux(hto_matrix,summary,report_gmm,mode,extract,threshold_gmm,ambiguous) - - emit: - gmm_demux.out.collect() -} - - -workflow{ - gmm_demux_hashing() - -} diff --git a/modules/single/hash_demulti/hashedDrops.nf b/modules/single/hash_demulti/hashedDrops.nf deleted file mode 100755 index ec4bd43..0000000 --- a/modules/single/hash_demulti/hashedDrops.nf +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -process hashedDrops{ - publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/hashedDrops", mode:'copy' - label 'small_mem' - - conda "conda-forge::r-seurat conda-forge::r-argparse bioconda::bioconductor-dropletutils" - - input: - path raw_hto_matrix_dir - each lower - each niters - each testAmbient - each ignore - each alpha - each round - each byRank - each isCellFDR - val objectOutEmptyDrops - val assignmentOutEmptyDrops - each runEmptyDrops - - each ambient - each minProp - each pseudoCount - each constantAmbient - each doubletNmads - each doubletMin - each doubletMixture - each confidentNmads - each confidenMin - each combinations - val objectOutHashedDrops - val assignmentOutHashedDrops - each gene_col - output: - path "hashedDrops_${task.index}" - - script: - def testAmb = testAmbient != 'False' ? " --testAmbient" : '' - def rou = round != 'False' ? " --round" : '' - def constantAmb = constantAmbient != 'False' ? " --constantAmbient" : '' - def doubletMix = doubletMixture != 'False' ? " --doubletMixture" : '' - def ign = ignore != "None" ? " --ignore ${ignore}" : '' - def alp = alpha != "None" ? " --alpha ${alpha}" : '' - def byR = byRank != "None" ? " --by.rank ${byRank}" : '' - def amb = ambient != 'False' ? " --ambient" : '' - def run_empty = runEmptyDrops != 'False' ? " --runEmptyDrops" : '' - def comb = combinations != "None" ? 
" --combinations ${combinations}" : '' - - """ - mkdir hashedDrops_${task.index} - dropletUtils.R --raw_hto_matrix_dir $raw_hto_matrix_dir --lower $lower --niters $niters --isCellFDR $isCellFDR --objectOutEmptyDrops $objectOutEmptyDrops --assignmentOutEmptyDrops $assignmentOutEmptyDrops --minProp $minProp --pseudoCount $pseudoCount --doubletNmads $doubletNmads --doubletMin $doubletMin --confidentNmads $confidentNmads --confidenMin $confidenMin --objectOutHashedDrops $objectOutHashedDrops --outputdir hashedDrops_${task.index} --assignmentOutHashedDrops ${assignmentOutHashedDrops}${testAmb}${ign}${alp}${rou}${byR}${constantAmb}${doubletMix}${amb}${comb}${run_empty} --gene_col $gene_col - """ - -} - - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() - } - else{ - Channel.from(input) - } -} - - -workflow hashedDrops_hashing{ - take: - hto_matrix - main: - lower = split_input(params.lower) - niters = split_input(params.niters) - testAmbient = split_input(params.testAmbient) - ignore = split_input(params.ignore_hashedDrops) - alpha = split_input(params.alpha_hashedDrops) - round = split_input(params.round) - byRank = split_input(params.byRank) - isCellFDR = split_input(params.isCellFDR) - objectOutEmptyDrops = params.objectOutEmptyDrops - assignmentOutEmptyDrops = params.assignmentOutEmptyDrops - runEmptyDrops = split_input(params.runEmptyDrops) - - ambient = split_input(params.ambient) - minProp = split_input(params.minProp) - pseudoCount = split_input(params.pseudoCount) - constantAmbient = split_input(params.constantAmbient) - doubletNmads = split_input(params.doubletNmads) - doubletMin = split_input(params.doubletMin) - doubletMixture = split_input(params.doubletMixture) - confidentNmads = split_input(params.confidentNmads) - confidenMin = split_input(params.confidenMin) - combinations = split_input(params.combinations) - objectOutHashedDrops = params.objectOutHashedDrops - assignmentOutHashedDrops = params.assignmentOutHashedDrops - gene_col = split_input(params.gene_col) - - hashedDrops(hto_matrix, lower, niters, testAmbient, ignore, alpha, round, byRank, isCellFDR, objectOutEmptyDrops, assignmentOutEmptyDrops,runEmptyDrops, ambient, minProp, pseudoCount, constantAmbient, doubletNmads, doubletMin, doubletMixture, confidentNmads, confidenMin, combinations, objectOutHashedDrops, assignmentOutHashedDrops, gene_col) - - emit: - hashedDrops.out.collect() -} - -workflow{ - hashedDrops_hashing() -} diff --git a/modules/single/hash_demulti/hashsolo.nf b/modules/single/hash_demulti/hashsolo.nf deleted file mode 100755 index e6ce254..0000000 --- a/modules/single/hash_demulti/hashsolo.nf +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 -process hash_solo{ - publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/hashsolo", mode:'copy' - label 'small_mem' - - conda "$projectDir/conda/hashsolo_py.yml" - - input: - path hto_data, stageAs: "hto_data_${params.hto_matrix_hashsolo}" - each priors_negative - each priors_singlet - each priors_doublet - each pre_existing_clusters - path rna_data, stageAs: "rna_data_${params.rna_matrix_hashsolo}" - val use_rna_data - each number_of_noise_barcodes - val assignmentOutHashSolo - val plotOutHashSolo - - output: - path "hashsolo_${task.index}" - - script: - def noise_barcodes = number_of_noise_barcodes != "None" ? "--number_of_noise_barcodes $number_of_noise_barcodes" : '' - def existing_clusters = pre_existing_clusters != "None" ? 
"--pre_existing_clusters $pre_existing_clusters" : '' - def clustering_data = use_rna_data != 'False' ? "--clustering_data rna_data_${params.rna_matrix_hashsolo}" : '' - """ - mkdir hashsolo_${task.index} - hashsolo.py --hto_data hto_data_${params.hto_matrix_hashsolo} --priors $priors_negative $priors_singlet $priors_doublet \ - $existing_clusters $clustering_data $noise_barcodes \ - --assignmentOutHashSolo $assignmentOutHashSolo \ - --plotOutHashSolo $plotOutHashSolo --outputdir hashsolo_${task.index} - """ - -} - - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() - } - else{ - Channel.from(input) - } -} - -workflow hash_solo_hashing { - take: - hto_matrix - rna_matrix - main: - use_rna_data = split_input(params.use_rna_data) - priors_negative = split_input(params.priors_negative) - priors_singlet = split_input(params.priors_singlet) - priors_doublet = split_input(params.priors_doublet) - pre_existing_clusters = split_input(params.pre_existing_clusters) - number_of_noise_barcodes = split_input(params.number_of_noise_barcodes) - assignmentOutHashSolo = params.assignmentOutHashSolo - plotOutHashSolo = params.plotOutHashSolo - - hash_solo(hto_matrix, priors_negative, priors_singlet, priors_doublet, pre_existing_clusters, rna_matrix, use_rna_data, number_of_noise_barcodes, assignmentOutHashSolo, plotOutHashSolo) - emit: - hash_solo.out.collect() -} \ No newline at end of file diff --git a/modules/single/hash_demulti/htodemux.nf b/modules/single/hash_demulti/htodemux.nf deleted file mode 100644 index 317594b..0000000 --- a/modules/single/hash_demulti/htodemux.nf +++ /dev/null @@ -1,114 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -process htodemux{ - publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/htodemux", mode: 'copy' - label 'small_mem' - - conda "conda-forge::r-seurat conda-forge::r-argparse" - - input: - each seurat_object - val assay - each quantile - each kfunc - each nstarts - each nsamples - each seed - each init - val objectOutHTO - val assignmentOutHTO - - //Ridge plot params - each ridgePlot - each ridgeNCol - //Scatter features params - each featureScatter - each scatterFeat1 - each scatterFeat2 - //Violin plot params - each vlnplot - each vlnFeatures - each vlnLog - //tSNE - each tsne - each tsneIdents - each tsneInvert - each tsneVerbose - each tsneApprox - each tsneDimMax - each tsnePerplexity - //Heatmap - each heatmap - each heatmapNcells - - output: - path "htodemux_${task.index}" - - script: - def init_val = init != 'None' ? " --init $init" : '' - def vln_log = vlnLog != 'False' ? "--vlnLog" : '' - def invert = tsneInvert != 'False' ? "--tSNEInvert" : '' - def verbose = tsneVerbose != 'False' ? "--tSNEVerbose" : '' - def approx = tsneApprox != 'False' ? 
"--tSNEApprox" : '' - - """ - mkdir htodemux_${task.index} - HTODemux.R --seuratObject $seurat_object --assay $assay --quantile $quantile --kfunc $kfunc --nstarts $nstarts --nsamples $nsamples --seed $seed $init_val --objectOutHTO $objectOutHTO --assignmentOutHTO $assignmentOutHTO --outputdir htodemux_${task.index} - HTODemux-visualisation.R --hashtagPath htodemux_${task.index}/${objectOutHTO}.rds --assay $assay --ridgePlot $ridgePlot --ridgeNCol $ridgeNCol --featureScatter $featureScatter --scatterFeat1 $scatterFeat1 --scatterFeat2 $scatterFeat2 --vlnPlot $vlnplot --vlnFeatures $vlnFeatures $vln_log --tSNE $tsne --tSNEIdents $tsneIdents $invert $verbose $approx --tSNEDimMax $tsneDimMax --tSNEPerplexity $tsnePerplexity --heatMap $heatmap --heatMapNcells $heatmapNcells --outputdir htodemux_${task.index} - """ - -} - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() - } - else{ - Channel.from(input) - } -} - - -workflow htodemux_hashing{ - take: - seurat_object - main: - quantile = split_input(params.quantile_htodemux) - assay = params.assay - kfunc = split_input(params.kfunc) - nstarts = split_input(params.nstarts) - nsamples = split_input(params.nsamples) - seed = split_input(params.seed) - init = split_input(params.init) - objectOutHTO = params.objectOutHTO - assignmentOutHTO = params.assignmentOutHTO - - ridgePlot = split_input(params.ridgePlot) - ridgeNCol = split_input(params.ridgeNCol) - featureScatter = split_input(params.featureScatter) - scatterFeat1 = split_input(params.scatterFeat1) - scatterFeat2 = split_input(params.scatterFeat2) - vlnplot = split_input(params.vlnplot) - vlnFeatures = split_input(params.vlnFeatures) - vlnLog = split_input(params.vlnLog) - - tsne = split_input(params.tsne) - tsneIdents = split_input(params.tsneIdents) - tsneInvert = split_input(params.tsneInvert) - tsneVerbose = split_input(params.tsneVerbose) - tsneApprox = split_input(params.tsneApprox) - tsneDimMax = split_input(params.tsneDimMax) - tsnePerplexity = split_input(params.tsnePerplexity) - heatmap = split_input(params.heatmap) - heatmapNcells = split_input(params.heatmapNcells) - - htodemux(seurat_object, assay, quantile, kfunc, nstarts, nsamples, seed, init, objectOutHTO, assignmentOutHTO, ridgePlot, ridgeNCol, featureScatter, scatterFeat1, scatterFeat2, vlnplot, vlnFeatures, vlnLog, tsne, tsneIdents, tsneInvert, tsneVerbose, tsneApprox, tsneDimMax, tsnePerplexity, heatmap, heatmapNcells) - - emit: - htodemux.out.collect() -} - -workflow{ - htodemux_hashing() -} diff --git a/modules/single/hash_demulti/multiseq.nf b/modules/single/hash_demulti/multiseq.nf deleted file mode 100644 index f56442c..0000000 --- a/modules/single/hash_demulti/multiseq.nf +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -process multi_seq{ - publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/multiseq", mode:'copy' - label 'small_mem' - - conda "conda-forge::r-seurat conda-forge::r-argparse" - - input: - each rdsObject - each quantile - each autoThresh - each maxiter - each qrangeFrom - each qrangeTo - each qrangeBy - each verbose - val assay - val objectOutMulti - val assignmentOutMulti - - output: - path "multiseq_${task.index}" - - script: - def autoThr = autoThresh != 'False' ? " --autoThresh" : '' - def verb = verbose != 'False' ? 
" --verbose" : '' - - """ - mkdir multiseq_${task.index} - MultiSeq.R --seuratObjectPath $rdsObject --assay $assay --quantile $quantile $autoThr --maxiter $maxiter --qrangeFrom $qrangeFrom --qrangeTo $qrangeTo --qrangeBy $qrangeBy $verb --objectOutMulti $objectOutMulti --assignmentOutMulti $assignmentOutMulti --outputdir multiseq_${task.index} - """ -} - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() - } - else{ - Channel.from(input) - } -} - -workflow multiseq_hashing{ - take: - rdsObject - main: - quantile = split_input(params.quantile_multi) - autoThresh = split_input(params.autoThresh) - maxIter = split_input(params.maxiter) - qrangeFrom = split_input(params.qrangeFrom) - qrangeTo = split_input(params.qrangeTo) - qrangeBy = split_input(params.qrangeBy) - verbose = split_input(params.verbose_multiseq) - assay = params.assay - objectOutMulti = params.objectOutMulti - assignmentOutMulti = params.assignmentOutMulti - multi_seq(rdsObject, quantile, autoThresh, maxIter, qrangeFrom, qrangeTo, qrangeBy, verbose, assay, objectOutMulti, assignmentOutMulti) - emit: - multi_seq.out.collect() -} diff --git a/modules/single/hash_demulti/preprocess.nf b/modules/single/hash_demulti/preprocess.nf deleted file mode 100644 index 422931e..0000000 --- a/modules/single/hash_demulti/preprocess.nf +++ /dev/null @@ -1,65 +0,0 @@ -process preprocess{ - publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti/preprocess", mode:'copy' - label 'small_mem' - - conda "conda-forge::r-seurat conda-forge::r-argparse" - - input: - path hto_matrix, stageAs: 'hto_data' - path umi_matrix, stageAs: 'rna_data' - val hto_raw_or_filtered - val rna_raw_or_filtered - val ndelim - each selection_method - each number_features - val assay - each margin - val normalisation_method - val preprocess_out - each gene_col - - output: - path "preprocess_${task.index}_hto_${hto_raw_or_filtered}_rna_${rna_raw_or_filtered}" - - script: - - """ - mkdir preprocess_${task.index}_hto_${hto_raw_or_filtered}_rna_${rna_raw_or_filtered} - pre_processing.R --fileUmi rna_data --fileHto hto_data --ndelim $ndelim \ - --selectMethod $selection_method --numberFeatures $number_features --assay $assay \ - --margin $margin --normalisationMethod $normalisation_method --OutputFile $preprocess_out \ - --outputdir preprocess_${task.index}_hto_${hto_raw_or_filtered}_rna_${rna_raw_or_filtered} --gene_col $gene_col - """ - - -} - - -def split_input(input){ - if (input =~ /;/ ){ - Channel.from(input).map{ return it.tokenize(';')}.flatten() - } - else{ - Channel.from(input) - } -} - -workflow preprocessing_hashing{ - take: - hto_matrix - rna_matrix - hto_raw_or_filtered - rna_raw_or_filtered - main: - sel_method = split_input(params.sel_method) - ndelim = params.ndelim - n_features = split_input(params.n_features) - assay = params.assay - margin = split_input(params.margin) - norm_method = split_input(params.norm_method) - out_file = params.preprocessOut - gene_col = split_input(params.gene_col) - preprocess(hto_matrix, rna_matrix, hto_raw_or_filtered, rna_raw_or_filtered, ndelim, sel_method, n_features, assay, margin, norm_method,out_file,gene_col) - emit: - preprocess.out.collect() -} \ No newline at end of file diff --git a/modules/single/hash_demultiplexing.nf b/modules/single/hash_demultiplexing.nf deleted file mode 100644 index e96f6f1..0000000 --- a/modules/single/hash_demultiplexing.nf +++ /dev/null @@ -1,179 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -include { 
preprocessing_hashing as preprocessing_hashing_htodemux } from './hash_demulti/preprocess' -include { preprocessing_hashing as preprocessing_hashing_multiseq } from './hash_demulti/preprocess' -include { multiseq_hashing } from './hash_demulti/multiseq' -include { htodemux_hashing } from './hash_demulti/htodemux' -include { hash_solo_hashing } from './hash_demulti/hashsolo' -include { hashedDrops_hashing } from './hash_demulti/hashedDrops' -include { demuxem_hashing } from './hash_demulti/demuxem' -include { gmm_demux_hashing } from './hash_demulti/gmm_demux' -include { bff_hashing } from './hash_demulti/bff' - -process summary{ - publishDir "$projectDir/$params.outdir/$params.mode/hash_demulti", mode: 'copy' - label 'small_mem' - - conda "pandas scanpy mudata" - - input: - val demuxem_result - val hashsolo_result - val htodemux_result - val multiseq_result - val hashedDrops_result - val gmmDemux_result - val bff_result - val generate_anndata - val generate_mudata - path rna_matrix, stageAs: 'rna_data' - path hto_matrix, stageAs: 'hto_data' - - output: - path hash_summary - - script: - def demuxem_files = "" - def htodemux_files = "" - def hashsolo_files = "" - def multiseq_files = "" - def hashedDrops_files = "" - def gmmDemux_files = "" - def bff_files = "" - def generate_adata = "" - def generate_mdata = "" - - if (demuxem_result != "no_result"){ - demuxem_files = "--demuxem ${demuxem_result.join(":")}" - } - if (hashsolo_result != "no_result"){ - hashsolo_files = "--hashsolo ${hashsolo_result.join(":")}" - } - if (htodemux_result != "no_result"){ - htodemux_files = "--htodemux ${htodemux_result.join(":")}" - } - if (multiseq_result != "no_result"){ - multiseq_files = "--multiseq ${multiseq_result.join(":")}" - } - if (hashedDrops_result != "no_result"){ - hashedDrops_files = "--hashedDrops ${hashedDrops_result.join(":")}" - } - if (gmmDemux_result != "no_result"){ - gmmDemux_files = "--gmm_demux ${gmmDemux_result.join(":")}" - } - if (bff_result != "no_result"){ - bff_files = "--bff ${bff_result.join(":")}" - } - if (generate_anndata == "True"){ - if(rna_matrix.name == "None"){ - error "Error: RNA count matrix is not given." - } - generate_adata = "--generate_anndata --read_rna_mtx rna_data" - } - if (generate_mudata == "True"){ - if(rna_matrix.name == "None"){ - error "Error: RNA count matrix is not given." - } - if(hto_matrix.name == "None"){ - error "Error: HTO count matrix is not given." - } - generate_mdata = "--generate_mudata --read_rna_mtx rna_data --read_hto_mtx hto_data" - } - - """ - summary_hash.py $demuxem_files $htodemux_files $multiseq_files $hashedDrops_files $hashsolo_files $gmmDemux_files $bff_files $generate_adata $generate_mdata - """ -} - - -workflow hash_demultiplexing{ - take: - rna_matrix_raw - rna_matrix_filtered - hto_matrix_raw - hto_matrix_filtered - main: - - if (params.htodemux == "True"){ - rna_matrix = params.rna_matrix_htodemux == "raw" ? rna_matrix_raw : rna_matrix_filtered - hto_matrix = params.hto_matrix_htodemux == "raw" ? 
hto_matrix_raw : hto_matrix_filtered - preprocessing_hashing_htodemux(hto_matrix, rna_matrix, params.hto_matrix_htodemux, params.rna_matrix_htodemux) - htodemux_preprocess_out = preprocessing_hashing_htodemux.out - htodemux_hashing(htodemux_preprocess_out) - htodemux_out = htodemux_hashing.out - } - else{ - htodemux_out = channel.value("no_result") - } - - if (params.multiseq == "True"){ - if (params.htodemux == "True" & params.hto_matrix_htodemux == params.hto_matrix_multiseq & - params.rna_matrix_htodemux == params.rna_matrix_multiseq){ - multiseq_preprocess_out = htodemux_preprocess_out - } - else{ - rna_matrix = params.rna_matrix_multiseq == "raw" ? rna_matrix_raw : rna_matrix_filtered - hto_matrix = params.hto_matrix_multiseq == "raw" ? hto_matrix_raw : hto_matrix_filtered - preprocessing_hashing_multiseq(hto_matrix, rna_matrix, params.hto_matrix_multiseq, params.rna_matrix_multiseq) - multiseq_preprocess_out = preprocessing_hashing_multiseq.out - } - multiseq_hashing(multiseq_preprocess_out) - multiseq_out = multiseq_hashing.out - } - else{ - multiseq_out = channel.value("no_result") - } - - if (params.hashsolo == "True"){ - hashsolo_hto_input = params.hto_matrix_hashsolo == "raw" ? hto_matrix_raw : hto_matrix_filtered - hashsolo_rna_input = params.rna_matrix_hashsolo == "False" ? channel.value("None") : - (params.rna_matrix_hashsolo == "raw" ? rna_matrix_raw : rna_matrix_filtered) - hash_solo_hashing(hashsolo_hto_input, hashsolo_rna_input) - hashsolo_out = hash_solo_hashing.out - } - else{ - hashsolo_out = channel.value("no_result") - } - - if (params.demuxem == "True"){ - demuxem_hto_input = params.hto_matrix_demuxem == "raw" ? hto_matrix_raw : hto_matrix_filtered - demuxem_rna_input = params.rna_matrix_demuxem == "raw" ? rna_matrix_raw : rna_matrix_filtered - demuxem_hashing(demuxem_hto_input, demuxem_rna_input) - demuxem_out = demuxem_hashing.out - } - else{ - demuxem_out = channel.value("no_result") - } - - if (params.hashedDrops == "True"){ - hashedDrops_hto_input = params.hto_matrix_hashedDrops == "raw" ? hto_matrix_raw : hto_matrix_filtered - hashedDrops_hashing(hashedDrops_hto_input) - hashedDrops_out = hashedDrops_hashing.out - } - else{ - hashedDrops_out = channel.value("no_result") - } - if (params.bff == "True"){ - bff_hto_input = params.hto_matrix_bff == "raw" ? hto_matrix_raw : hto_matrix_filtered - bff_hashing(bff_hto_input) - bff_out = bff_hashing.out - } - else{ - bff_out = channel.value("no_result") - } - if (params.gmmDemux == "True"){ - gmmDemux_hto_input = params.hto_matrix_gmm_demux == "raw" ? 
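
// Every hashing branch in this workflow resolves its HTO input with the same
// raw-versus-filtered switch, and htodemux/multiseq share one preprocessing run
// whenever both request the same matrix combination, avoiding a duplicate Seurat
// pass. A plain-Groovy sketch of the selection; names are illustrative:
def pick_matrix(choice, rawDir, filteredDir){
    choice == 'raw' ? rawDir : filteredDir
}
assert pick_matrix('raw', 'hto_raw', 'hto_filtered') == 'hto_raw'
assert pick_matrix('filtered', 'hto_raw', 'hto_filtered') == 'hto_filtered'
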
hto_matrix_raw : hto_matrix_filtered - gmm_demux_hashing(gmmDemux_hto_input) - gmmDemux_out = gmm_demux_hashing.out - } - else{ - gmmDemux_out = channel.value("no_result") - } - - - summary(demuxem_out, hashsolo_out, htodemux_out, multiseq_out, hashedDrops_out,gmmDemux_out,bff_out, - params.generate_anndata, params.generate_mudata, rna_matrix_filtered, hto_matrix_filtered) - emit: - summary.out -} diff --git a/modules/single_demultiplexing.nf b/modules/single_demultiplexing.nf deleted file mode 100644 index 2f2a871..0000000 --- a/modules/single_demultiplexing.nf +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env nextflow -nextflow.enable.dsl=2 - -include { gene_demultiplexing } from "$projectDir/modules/single/gene_demultiplexing" -include { hash_demultiplexing } from "$projectDir/modules/single/hash_demultiplexing" -include { donor_match } from "$projectDir/modules/single/donor_match" - - -process generate_data{ - publishDir "$params.outdir/$params.mode/data_output", mode: 'copy' - - conda "pandas scanpy mudata" - - input: - path assignment - val generate_anndata - val generate_mudata - val rna_matrix - val hto_matrix - output: - path "adata_with_donor_matching.h5ad", optional: true - path "mudata_with_donor_matching.h5mu", optional: true - - - script: - def generate_adata = "" - def generate_mdata = "" - - if (generate_anndata == "True"){ - if(rna_matrix == "None"){ - error "Error: RNA count matrix is not given." - } - generate_adata = "--generate_anndata --read_rna_mtx $rna_matrix" - } - if (generate_mudata == "True"){ - if(rna_matrix == "None"){ - error "Error: RNA count matrix is not given." - } - if(hto_matrix == "None"){ - error "Error: HTO count matrix is not given." - } - generate_mdata = "--generate_mudata --read_rna_mtx $rna_matrix --read_hto_mtx $hto_matrix" - } - - """ - generate_data.py --assignment $assignment $generate_adata $generate_mdata - """ -} - -process summary_all{ - publishDir "$params.outdir/$params.mode", mode: 'copy' - label 'small_mem' - - conda "pandas scanpy mudata" - - input: - path gene_demulti_result - path hash_demulti_result - output: - path "summary" - - script: - """ - summary.py --gene_demulti $gene_demulti_result --hash_demulti $hash_demulti_result - """ -} - - - - -workflow run_single{ - - print("-----Running single sample-----") - - if (params.mode == "genetic"){ - - // Performing genetic demultiplexing methodologies - gene_demultiplexing() - if (params.match_donor == "True"){ - donor_match(gene_demultiplexing.out) - } - } - else if (params.mode == "hashing"){ - - // Performing hashing demultplexing - hash_demultiplexing(params.rna_matrix_raw, params.rna_matrix_filtered, params.hto_matrix_raw, params.hto_matrix_filtered) - if (params.match_donor == "True"){ - donor_match(hash_demultiplexing.out) - } - } - else if (params.mode == "rescue"){ - - // Performing both hashing and genetic demultiplexing methods - hash_demultiplexing(params.rna_matrix_raw, params.rna_matrix_filtered, params.hto_matrix_raw, params.hto_matrix_filtered) - gene_demultiplexing() - gene_summary = gene_demultiplexing.out - hash_summary = hash_demultiplexing.out - summary_all(gene_summary, hash_summary) - - if (params.match_donor == "True"){ - donor_match(summary_all.out) - if (params.generate_anndata == "True" || params.generate_mudata == "True" ){ - generate_data(donor_match.out, params.generate_anndata, params.generate_mudata, - params.rna_matrix_filtered, params.hto_matrix_filtered) - } - } - } - else if (params.mode == "donor_match"){ - - // Performing just donor matching - 
donor_match(params.demultiplexing_result)
-        if (params.generate_anndata == "True" || params.generate_mudata == "True" ){
-            generate_data(donor_match.out, params.generate_anndata, params.generate_mudata,
-                params.rna_matrix_filtered, params.hto_matrix_filtered)
-        }
-
-    }
-}
\ No newline at end of file
diff --git a/subworkflows/HADGE.nf b/subworkflows/HADGE.nf
index 2ce5c35..85b5c79 100644
--- a/subworkflows/HADGE.nf
+++ b/subworkflows/HADGE.nf
@@ -1,7 +1,6 @@
 nextflow.enable.dsl=2
 include { run_multi } from "$projectDir/modules/multi_demultiplexing"
-include {run_single} from "$projectDir/modules/single_demultiplexing"
 include { summary } from "$projectDir/modules/multi/gene_demultiplexing"
 include { donor_match } from "$projectDir/modules/multi/donor_match"
 include {create_single_chanel_input} from "$projectDir/modules/multi/preprocessing/preprocessing"
 

From f6d47c443b63029a3b017b3b40700f008f0f81f8 Mon Sep 17 00:00:00 2001
From: Matiss Ozols
Date: Wed, 7 Feb 2024 09:22:54 +0000
Subject: [PATCH 07/16] changed chanel for scanpy

---
 conda/condaenv.31vp35_d.requirements.txt | 5 -----
 modules/multi/hash_demulti/demuxem.nf    | 2 +-
 2 files changed, 1 insertion(+), 6 deletions(-)
 delete mode 100644 conda/condaenv.31vp35_d.requirements.txt

diff --git a/conda/condaenv.31vp35_d.requirements.txt b/conda/condaenv.31vp35_d.requirements.txt
deleted file mode 100644
index b1be65e..0000000
--- a/conda/condaenv.31vp35_d.requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-solo-sc
-scanpy
-argparse
-torchmetrics==0.6.0
-matplotlib
\ No newline at end of file
diff --git a/modules/multi/hash_demulti/demuxem.nf b/modules/multi/hash_demulti/demuxem.nf
index 6ae69fc..8a7932e 100644
--- a/modules/multi/hash_demulti/demuxem.nf
+++ b/modules/multi/hash_demulti/demuxem.nf
@@ -5,7 +5,7 @@ process demuxem{
     publishDir "$params.outdir/$sampleId/$params.mode/hash_demulti/demuxem", mode:'copy'
     label 'small_mem'
 
-    conda "bioconda::pegasuspy demuxEM scanpy"
+    conda "bioconda::pegasuspy demuxEM conda-forge::scanpy"
 
     input:
         tuple val(sampleId), path(raw_hto_matrix_dir, stageAs: "hto_data_${params.hto_matrix_demuxem}"),

From 4a8e1b4bd82a2b5250c353428a399c555379d5a0 Mon Sep 17 00:00:00 2001
From: Lukas Heumos
Date: Wed, 7 Feb 2024 11:29:11 +0100
Subject: [PATCH 08/16] Remove whitespace

---
 modules/multi/gene_demultiplexing.nf | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modules/multi/gene_demultiplexing.nf b/modules/multi/gene_demultiplexing.nf
index 2567e7f..e2e5cf3 100644
--- a/modules/multi/gene_demultiplexing.nf
+++ b/modules/multi/gene_demultiplexing.nf
@@ -35,7 +35,6 @@ process subset_bam_to_comon_variants{
     script:
         """
-
        bcftools sort ${vcf} -Oz -o sorted.vcf.gz
        filter_bam_file_for_popscle_dsc_pileup.sh ${sam} ${barcodes} sorted.vcf.gz ${sampleId}__filtered_bam_file.bam
        """
 

From 5a5a864601b88602ec5e2709aea24f94cc8ea556 Mon Sep 17 00:00:00 2001
From: zethson
Date: Wed, 7 Feb 2024 11:53:07 +0100
Subject: [PATCH 09/16] Rufff

Signed-off-by: zethson
---
 .pre-commit-config.yaml                  |  13 +
 bin/demuxem.py                           | 165 +++--
 bin/generate_data.py                     |  81 ++-
 bin/gmm_demux_params.py                  |  47 +-
 bin/hashsolo.py                          | 107 +++-
 bin/solo_demul.py                        | 122 +++-
 bin/summary.py                           |  60 +-
 bin/summary_gene.py                      | 363 +++++++----
 bin/summary_hash.py                      | 748 +++++++++++++++--------
 conda/bff.yml                            |   1 -
 conda/condaenv.ucda_93c.requirements.txt |   3 -
 docs/source/conf.py                      |  25 +-
 subworkflows/HADGE.nf                    |  10 -
 13 files changed, 1230 insertions(+), 515 deletions(-)
 delete mode 100644 conda/condaenv.ucda_93c.requirements.txt

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index dc5148e..25daafc 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,5 +1,18 @@
+fail_fast: false
+default_language_version:
+    python: python3
+default_stages:
+    - commit
+    - push
+minimum_pre_commit_version: 2.16.0
 repos:
   - repo: https://github.com/pre-commit/mirrors-prettier
     rev: v2.7.0
     hooks:
       - id: prettier
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.2.0
+    hooks:
+      - id: ruff
+        args: [--fix, --exit-non-zero-on-fix, --unsafe-fixes]
+      - id: ruff-format
diff --git a/bin/demuxem.py b/bin/demuxem.py
index 4fac528..d03ae41 100755
--- a/bin/demuxem.py
+++ b/bin/demuxem.py
@@ -7,45 +7,110 @@ import pandas as pd
 import pegasusio as io
 
-parser = argparse.ArgumentParser(description='Parser for DemuxEM - Demultiplexing')
-parser.add_argument('--rna_matrix_dir', help= 'cellranger output folder which contains raw RNA count matrix in mtx format.')
-parser.add_argument('--hto_matrix_dir', help= 'cellranger output folder which contains raw HTO (antibody tag) count matrix in mtx format.')
-parser.add_argument('--randomState', help='Random seed set for reproducing results.', type=int, default=0)
-parser.add_argument('--min_signal', help='Any cell/nucleus with less than min_signal hashtags from the signal will be marked as unknown.', type=float, default=10.0)
-parser.add_argument('--min_num_genes', help='We only demultiplex cells/nuclei with at least of expressed genes.', type=int, default=100)
-parser.add_argument('--min_num_umis', help='We only demultiplex cells/nuclei with at least of UMIs.', type=int, default=100)
-parser.add_argument('--alpha', help='The Dirichlet prior concentration parameter (alpha) on samples. An alpha value < 1.0 will make the prior sparse.', type=float, default=0.0)
-parser.add_argument('--alpha_noise', help='The Dirichlet prior concenration parameter on the background noise.', type=float, default=1.0)
-parser.add_argument('--tol', help='Threshold used for the EM convergence.', type=float, default=1e-6)
-parser.add_argument('--n_threads', help='Number of threads to use. Must be a positive integer.', type=int, default=1)
-parser.add_argument('--filter_demuxem', help='Use the filter for RNA, True or False', default='True')
-parser.add_argument('--generateGenderPlot', help='Generate violin plots using gender-specific genes (e.g. Xist). is a comma-separated list of gene names.', default='')
-parser.add_argument('--objectOutDemuxem', help='Output name of demultiplexing results. All outputs will use it as the prefix.', default="demuxem_res")
-parser.add_argument('--outputdir', help='Output directory')
+parser = argparse.ArgumentParser(description="Parser for DemuxEM - Demultiplexing")
+parser.add_argument(
+    "--rna_matrix_dir",
+    help="cellranger output folder which contains raw RNA count matrix in mtx format.",
+)
+parser.add_argument(
+    "--hto_matrix_dir",
+    help="cellranger output folder which contains raw HTO (antibody tag) count matrix in mtx format.",
+)
+parser.add_argument(
+    "--randomState",
+    help="Random seed set for reproducing results.",
+    type=int,
+    default=0,
+)
+parser.add_argument(
+    "--min_signal",
+    help="Any cell/nucleus with less than min_signal hashtags from the signal will be marked as unknown.",
+    type=float,
+    default=10.0,
+)
+parser.add_argument(
+    "--min_num_genes",
+    help="We only demultiplex cells/nuclei with at least of expressed genes.",
+    type=int,
+    default=100,
+)
+parser.add_argument(
+    "--min_num_umis",
+    help="We only demultiplex cells/nuclei with at least of UMIs.",
+    type=int,
+    default=100,
+)
+parser.add_argument(
+    "--alpha",
+    help="The Dirichlet prior concentration parameter (alpha) on samples. An alpha value < 1.0 will make the prior sparse.",
+    type=float,
+    default=0.0,
+)
+parser.add_argument(
+    "--alpha_noise",
+    help="The Dirichlet prior concenration parameter on the background noise.",
+    type=float,
+    default=1.0,
+)
+parser.add_argument(
+    "--tol", help="Threshold used for the EM convergence.", type=float, default=1e-6
+)
+parser.add_argument(
+    "--n_threads",
+    help="Number of threads to use. Must be a positive integer.",
+    type=int,
+    default=1,
+)
+parser.add_argument(
+    "--filter_demuxem", help="Use the filter for RNA, True or False", default="True"
+)
+parser.add_argument(
+    "--generateGenderPlot",
+    help="Generate violin plots using gender-specific genes (e.g. Xist). is a comma-separated list of gene names.",
+    default="",
+)
+parser.add_argument(
+    "--objectOutDemuxem",
+    help="Output name of demultiplexing results. 
All outputs will use it as the prefix.", + default="demuxem_res", +) +parser.add_argument("--outputdir", help="Output directory") args = parser.parse_args() -param_list = [['rna_matrix_dir', args.rna_matrix_dir], ['hto_matrix_dir', args.hto_matrix_dir], ['randomState', args.randomState], ['min_signal', args.min_signal], ['min_num_genes', args.min_num_genes], ['min_num_umis', args.min_num_umis], ['alpha', args.alpha], ['alpha_noise', args.alpha_noise], ['tol', args.tol], ['n_threads', args.n_threads], ['generateGenderPlot', args.generateGenderPlot]] - -param_df = pd.DataFrame(param_list, columns=['Argument', 'Value']) +param_list = [ + ["rna_matrix_dir", args.rna_matrix_dir], + ["hto_matrix_dir", args.hto_matrix_dir], + ["randomState", args.randomState], + ["min_signal", args.min_signal], + ["min_num_genes", args.min_num_genes], + ["min_num_umis", args.min_num_umis], + ["alpha", args.alpha], + ["alpha_noise", args.alpha_noise], + ["tol", args.tol], + ["n_threads", args.n_threads], + ["generateGenderPlot", args.generateGenderPlot], +] -if __name__ == '__main__': +param_df = pd.DataFrame(param_list, columns=["Argument", "Value"]) + +if __name__ == "__main__": output_name = args.outputdir + "/" + args.objectOutDemuxem # load input rna data rna_data = sc.read_10x_mtx(args.rna_matrix_dir) - hashing_data = sc.read_10x_mtx(args.hto_matrix_dir,gex_only=False) + hashing_data = sc.read_10x_mtx(args.hto_matrix_dir, gex_only=False) rna = args.rna_matrix_dir filter = "" - if args.filter_demuxem.lower() in ['true', 't', 'yes', 'y', '1']: + if args.filter_demuxem.lower() in ["true", "t", "yes", "y", "1"]: filter = True - elif args.filter_demuxem.lower() in ['false', 'f', 'no', 'n', '0']: + elif args.filter_demuxem.lower() in ["false", "f", "no", "n", "0"]: filter = False else: - raise ValueError("Invalid boolean value: {}".format(value)) + raise ValueError(f"Invalid boolean value: {args.filter_demuxem.lower()}") # Filter the RNA matrix rna_data.obs["n_genes"] = rna_data.X.getnnz(axis=1) rna_data.obs["n_counts"] = rna_data.X.sum(axis=1).A1 - #data.obs["n_counts"] = rna_data.X.sum(axis=1).A1 - if(filter): + # data.obs["n_counts"] = rna_data.X.sum(axis=1).A1 + if filter: print("Filtering RNA matrix") obs_index = np.logical_and.reduce( ( @@ -56,25 +121,42 @@ rna_data._inplace_subset_obs(obs_index) # run demuxEM demuxEM.estimate_background_probs(hashing_data, random_state=args.randomState) - demuxEM.demultiplex(rna_data, hashing_data, min_signal=args.min_signal, alpha=args.alpha, alpha_noise=args.alpha_noise, tol=args.tol, n_threads=args.n_threads) + demuxEM.demultiplex( + rna_data, + hashing_data, + min_signal=args.min_signal, + alpha=args.alpha, + alpha_noise=args.alpha_noise, + tol=args.tol, + n_threads=args.n_threads, + ) # annotate raw matrix with demuxEM results demux_results = demuxEM.attach_demux_results(args.rna_matrix_dir, rna_data) # generate plots - demuxEM.plot_hto_hist(hashing_data, "hto_type", output_name + ".ambient_hashtag.hist.pdf", alpha=1.0) - demuxEM.plot_bar(hashing_data.uns["background_probs"], hashing_data.var_names, "Sample ID", - "Background probability", output_name + ".background_probabilities.bar.pdf",) - demuxEM.plot_hto_hist(hashing_data, "rna_type", output_name + ".real_content.hist.pdf", alpha=0.5) + demuxEM.plot_hto_hist( + hashing_data, "hto_type", output_name + ".ambient_hashtag.hist.pdf", alpha=1.0 + ) + demuxEM.plot_bar( + hashing_data.uns["background_probs"], + hashing_data.var_names, + "Sample ID", + "Background probability", + output_name + 
".background_probabilities.bar.pdf", + ) + demuxEM.plot_hto_hist( + hashing_data, "rna_type", output_name + ".real_content.hist.pdf", alpha=0.5 + ) demuxEM.plot_rna_hist(rna_data, hashing_data, output_name + ".rna_demux.hist.pdf") - + if len(args.generateGenderPlot) > 0: rna_data.matrices["raw.X"] = rna_data.X.copy() rna_data.as_float() scale = 1e5 / rna_data.X.sum(axis=1).A1 - rna_data.X.data *= np.repeat(scale, np.diff(data.X.indptr)) + rna_data.X.data *= np.repeat(scale, np.diff(rna_data.X.indptr)) rna_data.X.data = np.log1p(rna_data.X.data) for gene_name in args.generateGenderPlot: - plot_gene_violin( + demuxEM.plot_gene_violin( rna_data, gene_name, "{output_name}.{gene_name}.violin.pdf".format( @@ -91,16 +173,21 @@ print("total\t{}".format(rna_data.shape[0])) for name, value in rna_data.obs["demux_type"].value_counts().items(): print("{}\t{}".format(name, value)) - summary = rna_data.obs["demux_type"].value_counts().rename_axis('classification').reset_index(name='counts') + summary = ( + rna_data.obs["demux_type"] + .value_counts() + .rename_axis("classification") + .reset_index(name="counts") + ) total = ["total", rna_data.shape[0]] summary.loc[len(summary)] = total summary.to_csv(output_name + "_summary.csv", index=False) - param_df.fillna("None",inplace=True) + param_df.fillna("None", inplace=True) param_df.to_csv(args.outputdir + "/params.csv", index=False) - - rna_data.obs.assignment.replace(np.nan,'negative', inplace=True) + + rna_data.obs.assignment.replace(np.nan, "negative", inplace=True) hashtags = hashing_data.var.index.tolist() hashtags = hashtags + ["negative"] - toreplace = [ht for ht in rna_data.obs['assignment'].unique() if ht not in hashtags] - rna_data.obs.assignment.replace(toreplace,'doublet', inplace=True) + toreplace = [ht for ht in rna_data.obs["assignment"].unique() if ht not in hashtags] + rna_data.obs.assignment.replace(toreplace, "doublet", inplace=True) rna_data.obs.to_csv(output_name + "_obs.csv") diff --git a/bin/generate_data.py b/bin/generate_data.py index bc38485..146f813 100755 --- a/bin/generate_data.py +++ b/bin/generate_data.py @@ -3,40 +3,73 @@ import os import scanpy as sc import argparse +from mudata import MuData -parser = argparse.ArgumentParser(description="Parameters for generating anndata and mudata") -parser.add_argument("--assignment", help="Folder which contains cSV file with demultiplexing assignment", default=None) -parser.add_argument("--generate_anndata", help="Generate anndata", action='store_true') -parser.add_argument("--generate_mudata", help="Generate mudata", action='store_true') -parser.add_argument("--read_rna_mtx", help="10x-Genomics-formatted mtx directory for gene expression", default=None) -parser.add_argument("--read_hto_mtx", help="10x-Genomics-formatted mtx directory for HTO expression", default=None) +parser = argparse.ArgumentParser( + description="Parameters for generating anndata and mudata" +) +parser.add_argument( + "--assignment", + help="Folder which contains cSV file with demultiplexing assignment", + default=None, +) +parser.add_argument("--generate_anndata", help="Generate anndata", action="store_true") +parser.add_argument("--generate_mudata", help="Generate mudata", action="store_true") +parser.add_argument( + "--read_rna_mtx", + help="10x-Genomics-formatted mtx directory for gene expression", + default=None, +) +parser.add_argument( + "--read_hto_mtx", + help="10x-Genomics-formatted mtx directory for HTO expression", + default=None, +) args = parser.parse_args() -if __name__ == '__main__': +if __name__ == 
"__main__": if args.generate_anndata: adata = sc.read_10x_mtx(args.read_rna_mtx) - assignment_dir = os.path.join(args.assignment, - [filename for filename in os.listdir(args.assignment) if filename == "all_assignment_after_match.csv"][0]) + assignment_dir = os.path.join( + args.assignment, + [ + filename + for filename in os.listdir(args.assignment) + if filename == "all_assignment_after_match.csv" + ][0], + ) - assignment = pd.read_csv(assignment_dir, index_col = 0) - adata.obs = adata.obs.merge(assignment, left_index=True, right_index=True, how='left') - adata.obs= adata.obs.fillna("negative") - adata.obs[adata.obs.columns[0]]= adata.obs[adata.obs.columns[0]].astype(str) - adata.obs[adata.obs.columns[1]]= adata.obs[adata.obs.columns[1]].astype(str) + assignment = pd.read_csv(assignment_dir, index_col=0) + adata.obs = adata.obs.merge( + assignment, left_index=True, right_index=True, how="left" + ) + adata.obs = adata.obs.fillna("negative") + adata.obs[adata.obs.columns[0]] = adata.obs[adata.obs.columns[0]].astype(str) + adata.obs[adata.obs.columns[1]] = adata.obs[adata.obs.columns[1]].astype(str) adata.write("adata_with_donor_matching.h5ad") - + if args.generate_mudata: rna_data = sc.read_10x_mtx(args.read_rna_mtx) hto_data = sc.read_10x_mtx(args.read_hto_mtx, gex_only=False) - assignment_dir = os.path.join(args.assignment, - [filename for filename in os.listdir(args.assignment) if filename == "all_assignment_after_match.csv"][0]) + assignment_dir = os.path.join( + args.assignment, + [ + filename + for filename in os.listdir(args.assignment) + if filename == "all_assignment_after_match.csv" + ][0], + ) - assignment = pd.read_csv(assignment_dir, index_col = 0) - mudata = MuData({"rna": rna_data, "hto": hto_data }) + assignment = pd.read_csv(assignment_dir, index_col=0) + mudata = MuData({"rna": rna_data, "hto": hto_data}) - mudata['rna'].obs = mudata['rna'].obs.merge(args.assignment, left_index=True, right_index=True, how='left') - mudata['rna'].obs.rename(columns={mudata['rna'].obs.columns[0]: 'donor'}, inplace=True) - mudata['rna'].obs.donor = mudata['rna'].obs.donor.fillna("negative") - mudata['rna'].obs.donor = mudata['rna'].obs.donor.astype(str) + mudata["rna"].obs = mudata["rna"].obs.merge( + args.assignment, left_index=True, right_index=True, how="left" + ) + mudata["rna"].obs.rename( + columns={mudata["rna"].obs.columns[0]: "donor"}, inplace=True + ) + mudata["rna"].obs.donor = mudata["rna"].obs.donor.fillna("negative") + mudata["rna"].obs.donor = mudata["rna"].obs.donor.astype(str) mudata.update() - mudata.write("mudata_with_donor_matching.h5mu") + mudata.write("mudata_with_donor_matching.h5mu") diff --git a/bin/gmm_demux_params.py b/bin/gmm_demux_params.py index ee6f473..dc256a2 100755 --- a/bin/gmm_demux_params.py +++ b/bin/gmm_demux_params.py @@ -5,19 +5,50 @@ # Create a parser parser = argparse.ArgumentParser(description="Parameters for Demuxmix") parser.add_argument("--path_hto", help="cell hashing path, filtered HTO matrix") -parser.add_argument("--hto_name_gmm", help="a list of sample tags (HTOs) separated by ',' without whitespace") -parser.add_argument("--summary", help="Generate the statistical summary of the dataset. Requires an estimated total number of cells in the assay as input") +parser.add_argument( + "--hto_name_gmm", + help="a list of sample tags (HTOs) separated by ',' without whitespace", +) +parser.add_argument( + "--summary", + help="Generate the statistical summary of the dataset. 
Requires an estimated total number of cells in the assay as input", +) parser.add_argument("--report", help="Name for the summary file generated by GMM") parser.add_argument("--mode", help="mode csv or tsv") -parser.add_argument("--extract", help="Names of the sample barcoding tag(s) to extract, separated by ','. Joint tags are linked with '+'.", default='None') -parser.add_argument("--threshold_gmm", help="Provide the confidence threshold value. Requires a float in (0,1). Default value: 0.8.") +parser.add_argument( + "--extract", + help="Names of the sample barcoding tag(s) to extract, separated by ','. Joint tags are linked with '+'.", + default="None", +) +parser.add_argument( + "--threshold_gmm", + help="Provide the confidence threshold value. Requires a float in (0,1). Default value: 0.8.", +) parser.add_argument("--outputdir", help="Output directory") args = parser.parse_args() # Create parameters DataFrame -params = pd.DataFrame({ - "Argument": ["path_hto", "hto_name_gmm", "summary", "report", "mode", "extract", "threshold_gmm"], - "Value": [args.path_hto, args.hto_name_gmm, args.summary, args.report, args.mode, args.extract, args.threshold_gmm] -}) +params = pd.DataFrame( + { + "Argument": [ + "path_hto", + "hto_name_gmm", + "summary", + "report", + "mode", + "extract", + "threshold_gmm", + ], + "Value": [ + args.path_hto, + args.hto_name_gmm, + args.summary, + args.report, + args.mode, + args.extract, + args.threshold_gmm, + ], + } +) params.to_csv(f"{args.outputdir}/params.csv", index=False) diff --git a/bin/hashsolo.py b/bin/hashsolo.py index 9201761..b76b9d5 100755 --- a/bin/hashsolo.py +++ b/bin/hashsolo.py @@ -5,48 +5,97 @@ import argparse import pandas as pd -parser = argparse.ArgumentParser(description='Parser for HashSolo - Demultiplexing') +parser = argparse.ArgumentParser(description="Parser for HashSolo - Demultiplexing") -parser.add_argument('--hto_data', help='Input directory containing hashing count matrix in 10x mtx format.') -parser.add_argument('--priors', metavar='N', type=float, nargs=3, - help='a list of prior for each hypothesis. \ +parser.add_argument( + "--hto_data", + help="Input directory containing hashing count matrix in 10x mtx format.", +) +parser.add_argument( + "--priors", + metavar="N", + type=float, + nargs=3, + help="a list of prior for each hypothesis. 
\ The first element is prior for the negative hypothesis, \ - second for the singlet hypothesis, third element for the doublet hypothesis', - default=[0.01, 0.8, 0.19]) -parser.add_argument('--pre_existing_clusters', - help='column in cell_hashing_adata.obs for how to break up demultiplexing', - type=str, default=None) -parser.add_argument('--clustering_data', help='input directory containing transcriptomic data in 10x mtx format.', type=str, default=None) -parser.add_argument('--number_of_noise_barcodes', help='Number of barcodes to use to create noise distribution', type=int, default=None) -parser.add_argument('--assignmentOutHashSolo', help='prefix name for CSV results', type=str, default="hash_solo") -parser.add_argument('--plotOutHashSolo', help='prefix name for the JPG plot', type=str, default="hash_solo_plot") -parser.add_argument('--outputdir', help='Output directory') + second for the singlet hypothesis, third element for the doublet hypothesis", + default=[0.01, 0.8, 0.19], +) +parser.add_argument( + "--pre_existing_clusters", + help="column in cell_hashing_adata.obs for how to break up demultiplexing", + type=str, + default=None, +) +parser.add_argument( + "--clustering_data", + help="input directory containing transcriptomic data in 10x mtx format.", + type=str, + default=None, +) +parser.add_argument( + "--number_of_noise_barcodes", + help="Number of barcodes to use to create noise distribution", + type=int, + default=None, +) +parser.add_argument( + "--assignmentOutHashSolo", + help="prefix name for CSV results", + type=str, + default="hash_solo", +) +parser.add_argument( + "--plotOutHashSolo", + help="prefix name for the JPG plot", + type=str, + default="hash_solo_plot", +) +parser.add_argument("--outputdir", help="Output directory") args = parser.parse_args() -param_list = [['hto_data', args.hto_data], ['prior_negative', args.priors[0]], ['prior_singlet', args.priors[1]], ['prior_doublet', args.priors[2]], ['pre_existing_clusters', args.pre_existing_clusters], ['clustering_data', args.clustering_data], ['number_of_noise_barcodes', args.number_of_noise_barcodes]] - -param_df = pd.DataFrame(param_list, columns=['Argument', 'Value']) +param_list = [ + ["hto_data", args.hto_data], + ["prior_negative", args.priors[0]], + ["prior_singlet", args.priors[1]], + ["prior_doublet", args.priors[2]], + ["pre_existing_clusters", args.pre_existing_clusters], + ["clustering_data", args.clustering_data], + ["number_of_noise_barcodes", args.number_of_noise_barcodes], +] -if __name__ == '__main__': +param_df = pd.DataFrame(param_list, columns=["Argument", "Value"]) + +if __name__ == "__main__": cell_hashing_data = sc.read_10x_mtx(args.hto_data, gex_only=False) if args.clustering_data is not None: - trans_data = sc.read_10x_mtx(args.clustering_data) - trans_data.var_names_make_unique() - print("--------------------Get data-------------------------------") - hashsolo.hashsolo(cell_hashing_data, priors=args.priors, clustering_data=trans_data, pre_existing_clusters=args.pre_existing_clusters, number_of_noise_barcodes=args.number_of_noise_barcodes) + trans_data = sc.read_10x_mtx(args.clustering_data) + trans_data.var_names_make_unique() + print("--------------------Get data-------------------------------") + hashsolo.hashsolo( + cell_hashing_data, + priors=args.priors, + clustering_data=trans_data, + pre_existing_clusters=args.pre_existing_clusters, + number_of_noise_barcodes=args.number_of_noise_barcodes, + ) else: - print("--------------------Get data-------------------------------") - 
hashsolo.hashsolo(cell_hashing_data, priors=args.priors, pre_existing_clusters=args.pre_existing_clusters, number_of_noise_barcodes=args.number_of_noise_barcodes) + print("--------------------Get data-------------------------------") + hashsolo.hashsolo( + cell_hashing_data, + priors=args.priors, + pre_existing_clusters=args.pre_existing_clusters, + number_of_noise_barcodes=args.number_of_noise_barcodes, + ) print("--------------------Finished demultiplexing-------------------------------") print("------------------- Following Files are saved ----------------------------") print(args.assignmentOutHashSolo + "_res.csv") print(args.plotOutHashSolo + ".jpg") print("params.csv") - cell_hashing_data.obs.to_csv(args.outputdir + "/" + args.assignmentOutHashSolo + "_res.csv") + cell_hashing_data.obs.to_csv( + args.outputdir + "/" + args.assignmentOutHashSolo + "_res.csv" + ) hashsolo.plot_qc_checks_cell_hashing(cell_hashing_data) plt.savefig(args.outputdir + "/" + args.plotOutHashSolo + ".jpg", dpi=400) - param_df.fillna("None",inplace=True) + param_df.fillna("None", inplace=True) param_df.to_csv(args.outputdir + "/params.csv", index=False) - - - diff --git a/bin/solo_demul.py b/bin/solo_demul.py index b726180..72b247f 100755 --- a/bin/solo_demul.py +++ b/bin/solo_demul.py @@ -4,47 +4,107 @@ import argparse import pandas as pd -parser = argparse.ArgumentParser(description='Parser for SOLO - Doublet finding') +parser = argparse.ArgumentParser(description="Parser for SOLO - Doublet finding") # arguments mainly for solo -parser.add_argument('--rna_matrix_dir', help='Input folder to RNA expression matrix in 10x format.') +parser.add_argument( + "--rna_matrix_dir", help="Input folder to RNA expression matrix in 10x format." +) # for solo.train() -parser.add_argument('--max_epochs', help='Number of epochs to train for', type=int, default=400) -parser.add_argument('--lr', help='Learning rate for optimization.', type=float, default=0.001) -parser.add_argument('--train_size', help='Size of training set in the range between 0 and 1.', type=float, default=0.9) -parser.add_argument('--validation_size', - help='Size of the test set. If None, defaults to 1 - train_size. If train_size + validation_size < 1, the remaining cells belong to a test set.', - type=float, default=None) -parser.add_argument('--batch_size', help='Minibatch size to use during training.', type=int, default=128) -parser.add_argument('--early_stopping', help='Adds callback for early stopping on validation_loss.', type=lambda x: (str(x).lower() in ['true']), default=True) -parser.add_argument('--early_stopping_patience', help='Number of times early stopping metric can not improve over early_stopping_min_delta.', type=int, default=30) -parser.add_argument('--early_stopping_min_delta', help='Threshold for counting an epoch towards patience train().', type=float, default=0.0) +parser.add_argument( + "--max_epochs", help="Number of epochs to train for", type=int, default=400 +) +parser.add_argument( + "--lr", help="Learning rate for optimization.", type=float, default=0.001 +) +parser.add_argument( + "--train_size", + help="Size of training set in the range between 0 and 1.", + type=float, + default=0.9, +) +parser.add_argument( + "--validation_size", + help="Size of the test set. If None, defaults to 1 - train_size. 
If train_size + validation_size < 1, the remaining cells belong to a test set.", + type=float, + default=None, +) +parser.add_argument( + "--batch_size", help="Minibatch size to use during training.", type=int, default=128 +) +parser.add_argument( + "--early_stopping", + help="Adds callback for early stopping on validation_loss.", + type=lambda x: (str(x).lower() in ["true"]), + default=True, +) +parser.add_argument( + "--early_stopping_patience", + help="Number of times early stopping metric can not improve over early_stopping_min_delta.", + type=int, + default=30, +) +parser.add_argument( + "--early_stopping_min_delta", + help="Threshold for counting an epoch towards patience train().", + type=float, + default=0.0, +) # for solo.predict() -parser.add_argument('--soft', help='Return probabilities instead of class label', default=False, type=lambda x: (str(x).lower() in ['true'])) -parser.add_argument('--include_simulated_doublets', help='Return probabilities for simulated doublets as well.', - type=lambda x: (str(x).lower() in ['true']), default=False) -parser.add_argument('--assignmentOutSolo', help='Name for the CSV file containing the output of Solo prediction', - default="solo_prediction", type=str) -parser.add_argument('--outputdir', help='Output directory') +parser.add_argument( + "--soft", + help="Return probabilities instead of class label", + default=False, + type=lambda x: (str(x).lower() in ["true"]), +) +parser.add_argument( + "--include_simulated_doublets", + help="Return probabilities for simulated doublets as well.", + type=lambda x: (str(x).lower() in ["true"]), + default=False, +) +parser.add_argument( + "--assignmentOutSolo", + help="Name for the CSV file containing the output of Solo prediction", + default="solo_prediction", + type=str, +) +parser.add_argument("--outputdir", help="Output directory") args = parser.parse_args() -param_list = [['rna_matrix_dir', args.rna_matrix_dir], ['max_epochs', args.max_epochs], ['lr', args.lr], ['train_size', args.train_size], ['validation_size', args.validation_size], ['batch_size', args.batch_size], ['early_stopping', args.early_stopping], ['early_stopping_patience', args.early_stopping_patience], ['early_stopping_min_delta', args.early_stopping_min_delta], ['soft', args.soft], ['include_simulated_doublets', args.include_simulated_doublets]] - -param_df = pd.DataFrame(param_list, columns=['Argument', 'Value']) +param_list = [ + ["rna_matrix_dir", args.rna_matrix_dir], + ["max_epochs", args.max_epochs], + ["lr", args.lr], + ["train_size", args.train_size], + ["validation_size", args.validation_size], + ["batch_size", args.batch_size], + ["early_stopping", args.early_stopping], + ["early_stopping_patience", args.early_stopping_patience], + ["early_stopping_min_delta", args.early_stopping_min_delta], + ["soft", args.soft], + ["include_simulated_doublets", args.include_simulated_doublets], +] -if __name__ == '__main__': +param_df = pd.DataFrame(param_list, columns=["Argument", "Value"]) + +if __name__ == "__main__": adata = sc.read_10x_mtx(args.rna_matrix_dir) scvi.model.SCVI.setup_anndata(adata) vae = scvi.model.SCVI(adata) vae.train() solo = scvi.external.SOLO.from_scvi_model(vae) - solo.train(max_epochs=args.max_epochs, lr=args.lr, train_size=args.train_size, - validation_size=args.validation_size, batch_size=args.batch_size, - early_stopping=args.early_stopping, - early_stopping_patience=args.early_stopping_patience, - early_stopping_min_delta=args.early_stopping_min_delta) - prediction = solo.predict(args.soft, 
include_simulated_doublets=args.include_simulated_doublets) + solo.train( + max_epochs=args.max_epochs, + lr=args.lr, + train_size=args.train_size, + validation_size=args.validation_size, + batch_size=args.batch_size, + early_stopping=args.early_stopping, + early_stopping_patience=args.early_stopping_patience, + early_stopping_min_delta=args.early_stopping_min_delta, + ) + prediction = solo.predict( + args.soft, include_simulated_doublets=args.include_simulated_doublets + ) prediction.to_csv(args.outputdir + "/" + args.assignmentOutSolo + "_res.csv") param_df.to_csv(args.outputdir + "/params.csv", index=False) - - - diff --git a/bin/summary.py b/bin/summary.py index 6a4619e..0484c74 100755 --- a/bin/summary.py +++ b/bin/summary.py @@ -4,10 +4,19 @@ import pandas as pd parser = argparse.ArgumentParser(description="Parameters for summarizing results") -parser.add_argument("--gene_demulti", help="Folder containing output files of genetic demultiplexing pipeline", default=None) -parser.add_argument("--hash_demulti", help="Folder containing output files of hashing demultiplexing pipeline", default=None) +parser.add_argument( + "--gene_demulti", + help="Folder containing output files of genetic demultiplexing pipeline", + default=None, +) +parser.add_argument( + "--hash_demulti", + help="Folder containing output files of hashing demultiplexing pipeline", + default=None, +) args = parser.parse_args() + def merge_dataframes(dataframes): merged_df = pd.DataFrame() for df in dataframes: @@ -17,29 +26,50 @@ def merge_dataframes(dataframes): merged_df = pd.merge(merged_df, df, on="Barcode", how="outer") return merged_df -if __name__ == '__main__': + +if __name__ == "__main__": if not os.path.exists("summary"): os.makedirs("summary") # Assignments - assignment_gene = [os.path.join(args.gene_demulti, gene_file) for gene_file in os.listdir(args.gene_demulti) \ - if gene_file.endswith("_assignment_all.csv") and not gene_file.startswith('.')][0] + assignment_gene = [ + os.path.join(args.gene_demulti, gene_file) + for gene_file in os.listdir(args.gene_demulti) + if gene_file.endswith("_assignment_all.csv") and not gene_file.startswith(".") + ][0] assignment_gene = pd.read_csv(assignment_gene, dtype=str) - assignment_hash = [os.path.join(args.hash_demulti, hash_file) for hash_file in os.listdir(args.hash_demulti) \ - if hash_file.endswith("_assignment_all.csv") and not hash_file.startswith('.')][0] + assignment_hash = [ + os.path.join(args.hash_demulti, hash_file) + for hash_file in os.listdir(args.hash_demulti) + if hash_file.endswith("_assignment_all.csv") and not hash_file.startswith(".") + ][0] assignment_hash = pd.read_csv(assignment_hash, dtype=str) assignment_all = merge_dataframes([assignment_gene, assignment_hash]) - assignment_all= assignment_all.replace({"DBL": "doublet", "AMB": "negative"}) - assignment_all.to_csv("summary/assignment_all_genetic_and_hash.csv", index=False, sep='\t') + assignment_all = assignment_all.replace({"DBL": "doublet", "AMB": "negative"}) + assignment_all.to_csv( + "summary/assignment_all_genetic_and_hash.csv", index=False, sep="\t" + ) # Classifications - classification_gene = [os.path.join(args.gene_demulti, gene_file) for gene_file in os.listdir(args.gene_demulti) \ - if gene_file.endswith("_classification_all.csv") and not gene_file.startswith('.')][0] + classification_gene = [ + os.path.join(args.gene_demulti, gene_file) + for gene_file in os.listdir(args.gene_demulti) + if gene_file.endswith("_classification_all.csv") + and not gene_file.startswith(".") + ][0] 
classification_gene = pd.read_csv(classification_gene, dtype=str) - classification_hash = [os.path.join(args.hash_demulti, hash_file) for hash_file in os.listdir(args.hash_demulti) \ - if hash_file.endswith("_classification_all.csv") and not hash_file.startswith('.')][0] + classification_hash = [ + os.path.join(args.hash_demulti, hash_file) + for hash_file in os.listdir(args.hash_demulti) + if hash_file.endswith("_classification_all.csv") + and not hash_file.startswith(".") + ][0] classification_hash = pd.read_csv(classification_hash, dtype=str) classification_all = merge_dataframes([classification_gene, classification_hash]) - classification_all = classification_all.replace({"SNG": "singlet", "DBL": "doublet", "AMB": "negative"}) - classification_all.to_csv("summary/classification_all_genetic_and_hash.csv", index=False, sep='\t') \ No newline at end of file + classification_all = classification_all.replace( + {"SNG": "singlet", "DBL": "doublet", "AMB": "negative"} + ) + classification_all.to_csv( + "summary/classification_all_genetic_and_hash.csv", index=False, sep="\t" + ) diff --git a/bin/summary_gene.py b/bin/summary_gene.py index 0389d7b..c6ca8cf 100755 --- a/bin/summary_gene.py +++ b/bin/summary_gene.py @@ -8,51 +8,98 @@ parser = argparse.ArgumentParser(description="Parameters for summary process") -parser.add_argument("--demuxlet", help="Folder containing output files of Demuxlet", default=None) -parser.add_argument("--freemuxlet", help="Folder containing output files of Freemuxlet", default=None) -parser.add_argument("--vireo", help="Folder containing output files of Vireo", default=None) -parser.add_argument("--souporcell", help="Folder containing output files of Souporcell", default=None) -parser.add_argument("--scsplit", help="Folder containing output files of scSplit", default=None) -parser.add_argument("--generate_anndata", help="Generate anndata", action='store_true') -parser.add_argument("--generate_mudata", help="Generate mudata", action='store_true') -parser.add_argument("--read_rna_mtx", help="10x-Genomics-formatted mtx directory for gene expression", default=None) -parser.add_argument("--read_hto_mtx", help="10x-Genomics-formatted mtx directory for HTO expression", default=None) +parser.add_argument( + "--demuxlet", help="Folder containing output files of Demuxlet", default=None +) +parser.add_argument( + "--freemuxlet", help="Folder containing output files of Freemuxlet", default=None +) +parser.add_argument( + "--vireo", help="Folder containing output files of Vireo", default=None +) +parser.add_argument( + "--souporcell", help="Folder containing output files of Souporcell", default=None +) +parser.add_argument( + "--scsplit", help="Folder containing output files of scSplit", default=None +) +parser.add_argument("--generate_anndata", help="Generate anndata", action="store_true") +parser.add_argument("--generate_mudata", help="Generate mudata", action="store_true") +parser.add_argument( + "--read_rna_mtx", + help="10x-Genomics-formatted mtx directory for gene expression", + default=None, +) +parser.add_argument( + "--read_hto_mtx", + help="10x-Genomics-formatted mtx directory for HTO expression", + default=None, +) args = parser.parse_args() + def demuxlet_summary(demuxlet_res, raw_adata, raw_mudata): assign = [] params = [] for x in demuxlet_res: - obs_res_dir = [file for file in os.listdir(x) if file.endswith('.best')][0] - obs_res = pd.read_csv(os.path.join(x, obs_res_dir), sep='\t') + obs_res_dir = [file for file in os.listdir(x) if file.endswith(".best")][0] + 
obs_res = pd.read_csv(os.path.join(x, obs_res_dir), sep="\t") obs_res = obs_res.iloc[:, [1, 4, 5]] - obs_res['Assignment'] = np.where(obs_res['BEST.GUESS'].str.split(',').str[0] == obs_res['BEST.GUESS'].str.split(',').str[1], - obs_res['BEST.GUESS'].str.split(',').str[0], "doublet") - obs_res['Assignment'] = np.where(obs_res['DROPLET.TYPE'] == 'AMB', 'negative', obs_res['Assignment']) - obs_res.rename(columns={"BARCODE": "Barcode", "Assignment": os.path.basename(x)}, inplace=True) - obs_res.set_index('Barcode', inplace=True) + obs_res["Assignment"] = np.where( + obs_res["BEST.GUESS"].str.split(",").str[0] + == obs_res["BEST.GUESS"].str.split(",").str[1], + obs_res["BEST.GUESS"].str.split(",").str[0], + "doublet", + ) + obs_res["Assignment"] = np.where( + obs_res["DROPLET.TYPE"] == "AMB", "negative", obs_res["Assignment"] + ) + obs_res.rename( + columns={"BARCODE": "Barcode", "Assignment": os.path.basename(x)}, + inplace=True, + ) + obs_res.set_index("Barcode", inplace=True) demuxlet_assign = obs_res[[os.path.basename(x)]] - + if raw_adata is not None: adata = raw_adata.copy() - adata.obs = adata.obs.merge(demuxlet_assign, left_index=True, right_index=True, how='left') - adata.obs.rename(columns={adata.obs.columns[0]: 'donor'}, inplace=True) + adata.obs = adata.obs.merge( + demuxlet_assign, left_index=True, right_index=True, how="left" + ) + adata.obs.rename(columns={adata.obs.columns[0]: "donor"}, inplace=True) adata.obs.donor = adata.obs.donor.fillna("negative") adata.obs.donor = adata.obs.donor.astype(str) - adata.write("genetic_summary/adata/adata_with_"+os.path.basename(x)+".h5ad") + adata.write( + "genetic_summary/adata/adata_with_" + os.path.basename(x) + ".h5ad" + ) assign.append(demuxlet_assign) if raw_mudata is not None: mudata = raw_mudata.copy() - mudata['rna'].obs = mudata['rna'].obs.merge(demuxlet_assign, left_index=True, right_on='Barcode', how='left').set_index('Barcode') - mudata['rna'].obs.rename(columns={mudata['rna'].obs.columns[0]: 'donor'}, inplace=True) - mudata['rna'].obs.donor = mudata['rna'].obs.donor.fillna("negative") - mudata['rna'].obs.donor = mudata['rna'].obs.donor.astype(str) + mudata["rna"].obs = ( + mudata["rna"] + .obs.merge( + demuxlet_assign, left_index=True, right_on="Barcode", how="left" + ) + .set_index("Barcode") + ) + mudata["rna"].obs.rename( + columns={mudata["rna"].obs.columns[0]: "donor"}, inplace=True + ) + mudata["rna"].obs.donor = mudata["rna"].obs.donor.fillna("negative") + mudata["rna"].obs.donor = mudata["rna"].obs.donor.astype(str) mudata.update() - mudata.write("genetic_summary/mudata/mudata_with_"+ os.path.basename(x)+".h5mu") - - params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("params.csv")][0]) + mudata.write( + "genetic_summary/mudata/mudata_with_" + os.path.basename(x) + ".h5mu" + ) + + params_dir = os.path.join( + x, + [filename for filename in os.listdir(x) if filename.endswith("params.csv")][ + 0 + ], + ) params_res = pd.read_csv(params_dir, keep_default_na=False, index_col=0) params_res.columns = [os.path.basename(x)] params.append(params_res) @@ -67,41 +114,72 @@ def demuxlet_summary(demuxlet_res, raw_adata, raw_mudata): params = pd.concat(params, axis=1) params.to_csv("genetic_summary/demuxlet_params.csv") + def freemuxlet_summary(freemuxlet_res, raw_adata, raw_mudata): assign = [] params = [] for x in freemuxlet_res: - obs_res_dir = [file for file in os.listdir(x) if file.endswith('.clust1.samples.gz')][0] - obs_res = pd.read_csv(os.path.join(x, obs_res_dir), sep='\t') + obs_res_dir 
= [ + file for file in os.listdir(x) if file.endswith(".clust1.samples.gz") + ][0] + obs_res = pd.read_csv(os.path.join(x, obs_res_dir), sep="\t") obs_res = obs_res.iloc[:, [1, 4, 5]] - obs_res['Assignment'] = np.where(obs_res['BEST.GUESS'].str.split(',').str[0] == obs_res['BEST.GUESS'].str.split(',').str[1], - obs_res['BEST.GUESS'].str.split(',').str[0], "doublet") - obs_res['Assignment'] = np.where(obs_res['DROPLET.TYPE'] == 'AMB', 'negative', obs_res['Assignment']) - obs_res.rename(columns={"BARCODE": "Barcode", "Assignment": os.path.basename(x)}, inplace=True) - obs_res.set_index('Barcode', inplace=True) + obs_res["Assignment"] = np.where( + obs_res["BEST.GUESS"].str.split(",").str[0] + == obs_res["BEST.GUESS"].str.split(",").str[1], + obs_res["BEST.GUESS"].str.split(",").str[0], + "doublet", + ) + obs_res["Assignment"] = np.where( + obs_res["DROPLET.TYPE"] == "AMB", "negative", obs_res["Assignment"] + ) + obs_res.rename( + columns={"BARCODE": "Barcode", "Assignment": os.path.basename(x)}, + inplace=True, + ) + obs_res.set_index("Barcode", inplace=True) freemuxlet_assign = obs_res[[os.path.basename(x)]] - + if raw_adata is not None: adata = raw_adata.copy() - adata.obs = adata.obs.merge(freemuxlet_assign, left_index=True, right_index=True, how='left') - adata.obs.rename(columns={adata.obs.columns[0]: 'donor'}, inplace=True) + adata.obs = adata.obs.merge( + freemuxlet_assign, left_index=True, right_index=True, how="left" + ) + adata.obs.rename(columns={adata.obs.columns[0]: "donor"}, inplace=True) adata.obs.donor = adata.obs.donor.fillna("negative") adata.obs.donor = adata.obs.donor.astype(str) - adata.write("genetic_summary/adata/adata_with_"+ os.path.basename(x)+".h5ad") + adata.write( + "genetic_summary/adata/adata_with_" + os.path.basename(x) + ".h5ad" + ) if raw_mudata is not None: mudata = raw_mudata.copy() - mudata['rna'].obs = mudata['rna'].obs.merge(freemuxlet_assign, left_index=True, right_on='Barcode', how='left').set_index('Barcode') - mudata['rna'].obs.rename(columns={mudata['rna'].obs.columns[0]: 'donor'}, inplace=True) - mudata['rna'].obs.donor = mudata['rna'].obs.donor.fillna("negative") - mudata['rna'].obs.donor = mudata['rna'].obs.donor.astype(str) + mudata["rna"].obs = ( + mudata["rna"] + .obs.merge( + freemuxlet_assign, left_index=True, right_on="Barcode", how="left" + ) + .set_index("Barcode") + ) + mudata["rna"].obs.rename( + columns={mudata["rna"].obs.columns[0]: "donor"}, inplace=True + ) + mudata["rna"].obs.donor = mudata["rna"].obs.donor.fillna("negative") + mudata["rna"].obs.donor = mudata["rna"].obs.donor.astype(str) mudata.update() - mudata.write("genetic_summary/mudata/mudata_with_"+ os.path.basename(x)+".h5mu") + mudata.write( + "genetic_summary/mudata/mudata_with_" + os.path.basename(x) + ".h5mu" + ) assign.append(freemuxlet_assign) - params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("params.csv")][0]) + params_dir = os.path.join( + x, + [filename for filename in os.listdir(x) if filename.endswith("params.csv")][ + 0 + ], + ) params_res = pd.read_csv(params_dir, keep_default_na=False, index_col=0) params_res.columns = [os.path.basename(x)] params.append(params_res) @@ -116,6 +194,7 @@ def freemuxlet_summary(freemuxlet_res, raw_adata, raw_mudata): params = pd.concat(params, axis=1) params.to_csv("genetic_summary/freemuxlet_params.csv") + def souporcell_summary(souporcell_res, raw_adata, raw_mudata): assign = [] params = [] @@ -124,34 +203,54 @@ def souporcell_summary(souporcell_res, raw_adata, raw_mudata): for root, 
dirs, files in os.walk(x): if "clusters.tsv" in files: obs_res_dir = os.path.join(root, "clusters.tsv") - obs_res = pd.read_csv(os.path.join(x, obs_res_dir), sep='\t') + obs_res = pd.read_csv(os.path.join(x, obs_res_dir), sep="\t") obs_res = obs_res.iloc[:, 0:3] - obs_res.loc[obs_res['status'] == 'doublet', 'assignment'] = 'doublet' - obs_res.loc[obs_res['status'] == 'unassigned', 'assignment'] = 'negative' - obs_res.rename(columns={'barcode': 'Barcode', 'assignment': os.path.basename(x)}, inplace=True) - obs_res.set_index('Barcode', inplace=True) + obs_res.loc[obs_res["status"] == "doublet", "assignment"] = "doublet" + obs_res.loc[obs_res["status"] == "unassigned", "assignment"] = "negative" + obs_res.rename( + columns={"barcode": "Barcode", "assignment": os.path.basename(x)}, + inplace=True, + ) + obs_res.set_index("Barcode", inplace=True) obs_res = obs_res[[os.path.basename(x)]] if raw_adata is not None: adata = raw_adata.copy() - adata.obs = adata.obs.merge(obs_res, left_index=True, right_index=True, how='left') - adata.obs.rename(columns={adata.obs.columns[0]: 'donor'}, inplace=True) + adata.obs = adata.obs.merge( + obs_res, left_index=True, right_index=True, how="left" + ) + adata.obs.rename(columns={adata.obs.columns[0]: "donor"}, inplace=True) adata.obs.donor = adata.obs.donor.fillna("negative") adata.obs.donor = adata.obs.donor.astype(str) - adata.write("genetic_summary/adata/adata_with_"+ os.path.basename(x)+".h5ad") + adata.write( + "genetic_summary/adata/adata_with_" + os.path.basename(x) + ".h5ad" + ) if raw_mudata is not None: mudata = raw_mudata.copy() - mudata['rna'].obs = mudata['rna'].obs.merge(obs_res, left_index=True, right_on='Barcode', how='left').set_index('Barcode') - mudata['rna'].obs.rename(columns={mudata['rna'].obs.columns[0]: 'donor'}, inplace=True) - mudata['rna'].obs.donor = mudata['rna'].obs.donor.fillna("negative") - mudata['rna'].obs.donor = mudata['rna'].obs.donor.astype(str) + mudata["rna"].obs = ( + mudata["rna"] + .obs.merge(obs_res, left_index=True, right_on="Barcode", how="left") + .set_index("Barcode") + ) + mudata["rna"].obs.rename( + columns={mudata["rna"].obs.columns[0]: "donor"}, inplace=True + ) + mudata["rna"].obs.donor = mudata["rna"].obs.donor.fillna("negative") + mudata["rna"].obs.donor = mudata["rna"].obs.donor.astype(str) mudata.update() - mudata.write("genetic_summary/mudata/mudata_with_"+ os.path.basename(x)+".h5mu") + mudata.write( + "genetic_summary/mudata/mudata_with_" + os.path.basename(x) + ".h5mu" + ) assign.append(obs_res) - params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("params.csv")][0]) + params_dir = os.path.join( + x, + [filename for filename in os.listdir(x) if filename.endswith("params.csv")][ + 0 + ], + ) params_res = pd.read_csv(params_dir, keep_default_na=False, index_col=0) params_res.columns = [os.path.basename(x)] params.append(params_res) @@ -166,6 +265,7 @@ def souporcell_summary(souporcell_res, raw_adata, raw_mudata): params = pd.concat(params, axis=1) params.to_csv("genetic_summary/souporcell_params.csv") + def vireo_summary(vireo_res, raw_adata, raw_mudata): assign = [] params = [] @@ -175,33 +275,52 @@ def vireo_summary(vireo_res, raw_adata, raw_mudata): for root, dirs, files in os.walk(x): if "donor_ids.tsv" in files: obs_res_dir = os.path.join(root, "donor_ids.tsv") - obs_res = pd.read_csv(os.path.join(x, obs_res_dir), sep='\t') - bs_res = obs_res.iloc[:, [0, 1]] + obs_res = pd.read_csv(os.path.join(x, obs_res_dir), sep="\t") + obs_res.iloc[:, [0, 1]] obs_res[obs_res 
== "unassigned"] = "negative" - obs_res.rename(columns={'cell': 'Barcode', 'donor_id': os.path.basename(x)}, inplace=True) - obs_res.set_index('Barcode', inplace=True) + obs_res.rename( + columns={"cell": "Barcode", "donor_id": os.path.basename(x)}, inplace=True + ) + obs_res.set_index("Barcode", inplace=True) obs_res = obs_res[[os.path.basename(x)]] if raw_adata is not None: adata = raw_adata.copy() - adata.obs = adata.obs.merge(obs_res, left_index=True, right_index=True, how='left') - adata.obs.rename(columns={adata.obs.columns[0]: 'donor'}, inplace=True) + adata.obs = adata.obs.merge( + obs_res, left_index=True, right_index=True, how="left" + ) + adata.obs.rename(columns={adata.obs.columns[0]: "donor"}, inplace=True) adata.obs.donor = adata.obs.donor.fillna("negative") adata.obs.donor = adata.obs.donor.astype(str) - adata.write("genetic_summary/adata/adata_with_"+ os.path.basename(x)+".h5ad") + adata.write( + "genetic_summary/adata/adata_with_" + os.path.basename(x) + ".h5ad" + ) if raw_mudata is not None: mudata = raw_mudata.copy() - mudata['rna'].obs = mudata['rna'].obs.merge(obs_res, left_index=True, right_on='Barcode', how='left').set_index('Barcode') - mudata['rna'].obs.rename(columns={mudata['rna'].obs.columns[0]: 'donor'}, inplace=True) - mudata['rna'].obs.donor = mudata['rna'].obs.donor.fillna("negative") - mudata['rna'].obs.donor = mudata['rna'].obs.donor.astype(str) + mudata["rna"].obs = ( + mudata["rna"] + .obs.merge(obs_res, left_index=True, right_on="Barcode", how="left") + .set_index("Barcode") + ) + mudata["rna"].obs.rename( + columns={mudata["rna"].obs.columns[0]: "donor"}, inplace=True + ) + mudata["rna"].obs.donor = mudata["rna"].obs.donor.fillna("negative") + mudata["rna"].obs.donor = mudata["rna"].obs.donor.astype(str) mudata.update() - mudata.write("genetic_summary/mudata/mudata_with_"+ os.path.basename(x)+".h5mu") + mudata.write( + "genetic_summary/mudata/mudata_with_" + os.path.basename(x) + ".h5mu" + ) assign.append(obs_res) - params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("params.csv")][0]) + params_dir = os.path.join( + x, + [filename for filename in os.listdir(x) if filename.endswith("params.csv")][ + 0 + ], + ) params_res = pd.read_csv(params_dir, keep_default_na=False, index_col=0) params_res.columns = [os.path.basename(x)] params.append(params_res) @@ -216,40 +335,65 @@ def vireo_summary(vireo_res, raw_adata, raw_mudata): params = pd.concat(params, axis=1) params.to_csv("genetic_summary/vireo_params.csv") + def scsplit_summary(scsplit_res, raw_adata, raw_mudata): assign = [] params = [] for x in scsplit_res: - obs_res_dir = next((os.path.join(root, "scSplit_result.csv") for root, dirs, files in os.walk(x) if "scSplit_result.csv" in files),"") + obs_res_dir = next( + ( + os.path.join(root, "scSplit_result.csv") + for root, dirs, files in os.walk(x) + if "scSplit_result.csv" in files + ), + "", + ) obs_res = pd.read_table(obs_res_dir) - obs_res['Assignment'] = obs_res['Cluster'].str.split('-').str[1] - obs_res['Classification'] = obs_res['Cluster'].str.split('-').str[0] - obs_res.loc[obs_res['Classification'] == 'DBL', 'Assignment'] = 'doublet' - obs_res = obs_res.drop(columns=['Cluster', 'Classification']) - obs_res.set_index('Barcode', inplace=True) + obs_res["Assignment"] = obs_res["Cluster"].str.split("-").str[1] + obs_res["Classification"] = obs_res["Cluster"].str.split("-").str[0] + obs_res.loc[obs_res["Classification"] == "DBL", "Assignment"] = "doublet" + obs_res = obs_res.drop(columns=["Cluster", 
"Classification"]) + obs_res.set_index("Barcode", inplace=True) obs_res.columns = [os.path.basename(x)] - + if raw_adata is not None: adata = raw_adata.copy() - adata.obs = adata.obs.merge(obs_res, left_index=True, right_index=True, how='left') - adata.obs.rename(columns={adata.obs.columns[0]: 'donor'}, inplace=True) + adata.obs = adata.obs.merge( + obs_res, left_index=True, right_index=True, how="left" + ) + adata.obs.rename(columns={adata.obs.columns[0]: "donor"}, inplace=True) adata.obs.donor = adata.obs.donor.fillna("negative") adata.obs.donor = adata.obs.donor.astype(str) - adata.write("genetic_summary/adata/adata_with_"+ os.path.basename(x)+".h5ad") - + adata.write( + "genetic_summary/adata/adata_with_" + os.path.basename(x) + ".h5ad" + ) + if raw_mudata is not None: mudata = raw_mudata.copy() - mudata['rna'].obs = mudata['rna'].obs.merge(obs_res, left_index=True, right_on='Barcode', how='left').set_index('Barcode') - mudata['rna'].obs.rename(columns={mudata['rna'].obs.columns[0]: 'donor'}, inplace=True) - mudata['rna'].obs.donor = mudata['rna'].obs.donor.fillna("negative") - mudata['rna'].obs.donor = mudata['rna'].obs.donor.astype(str) + mudata["rna"].obs = ( + mudata["rna"] + .obs.merge(obs_res, left_index=True, right_on="Barcode", how="left") + .set_index("Barcode") + ) + mudata["rna"].obs.rename( + columns={mudata["rna"].obs.columns[0]: "donor"}, inplace=True + ) + mudata["rna"].obs.donor = mudata["rna"].obs.donor.fillna("negative") + mudata["rna"].obs.donor = mudata["rna"].obs.donor.astype(str) mudata.update() - mudata.write("genetic_summary/mudata/mudata_with_"+ os.path.basename(x)+".h5mu") + mudata.write( + "genetic_summary/mudata/mudata_with_" + os.path.basename(x) + ".h5mu" + ) assign.append(obs_res) - params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("params.csv")][0]) + params_dir = os.path.join( + x, + [filename for filename in os.listdir(x) if filename.endswith("params.csv")][ + 0 + ], + ) params_res = pd.read_csv(params_dir, keep_default_na=False, index_col=0) params_res.columns = [os.path.basename(x)] params.append(params_res) @@ -258,18 +402,19 @@ def scsplit_summary(scsplit_res, raw_adata, raw_mudata): assign.to_csv("genetic_summary/scsplit_assignment.csv", quoting=False) classi = assign.copy() - classi[(classi != 'negative') & (classi != 'doublet')] = 'singlet' + classi[(classi != "negative") & (classi != "doublet")] = "singlet" classi.to_csv("genetic_summary/scsplit_classification.csv", quoting=False) - + params = pd.concat(params, axis=1) params.to_csv("genetic_summary/scsplit_params.csv") -if __name__ == '__main__': + +if __name__ == "__main__": adata = None mudata = None if not os.path.exists("genetic_summary"): os.mkdir("genetic_summary") - + if args.generate_anndata is True: os.mkdir("genetic_summary/adata") adata = sc.read_10x_mtx(args.read_rna_mtx) @@ -279,49 +424,61 @@ def scsplit_summary(scsplit_res, raw_adata, raw_mudata): os.mkdir("genetic_summary/mudata") rna_data = sc.read_10x_mtx(args.read_rna_mtx) hto_data = sc.read_10x_mtx(args.read_hto_mtx, gex_only=False) - mudata = MuData({"rna": rna_data, "hto": hto_data }) + mudata = MuData({"rna": rna_data, "hto": hto_data}) if args.demuxlet is not None: - demuxlet_res = args.demuxlet.split(':') + demuxlet_res = args.demuxlet.split(":") demuxlet_summary(demuxlet_res, adata, mudata) print("Demuxlet result found") if args.freemuxlet is not None: - freemuxlet_res = args.freemuxlet.split(':') + freemuxlet_res = args.freemuxlet.split(":") freemuxlet_summary(freemuxlet_res, 
adata, mudata) print("Freemuxlet result found") if args.vireo is not None: - vireo_res = args.vireo.split(':') + vireo_res = args.vireo.split(":") vireo_summary(vireo_res, adata, mudata) print("Vireo result found") if args.scsplit is not None: - scsplit_res = args.scsplit.split(':') + scsplit_res = args.scsplit.split(":") scsplit_summary(scsplit_res, adata, mudata) print("scSplit result found") if args.souporcell is not None: - souporcell_res = args.souporcell.split(':') + souporcell_res = args.souporcell.split(":") souporcell_summary(souporcell_res, adata, mudata) print("Souporcell result found") # Read and combine assignment files - assignment = [file for file in os.listdir("genetic_summary") if file.endswith("_assignment.csv")] + assignment = [ + file + for file in os.listdir("genetic_summary") + if file.endswith("_assignment.csv") + ] assignment_all = pd.read_csv(os.path.join("genetic_summary", assignment[0])) if len(assignment) > 1: for df in assignment[1:]: df = pd.read_csv(os.path.join("genetic_summary", df)) - assignment_all = pd.merge(assignment_all, df, on='Barcode', how='outer') + assignment_all = pd.merge(assignment_all, df, on="Barcode", how="outer") assignment_all.to_csv("genetic_summary/genetic_assignment_all.csv", index=False) # Read and combine classification files - classification = [file for file in os.listdir("genetic_summary") if file.endswith("_classification.csv")] + classification = [ + file + for file in os.listdir("genetic_summary") + if file.endswith("_classification.csv") + ] classification_all = pd.read_csv(os.path.join("genetic_summary", classification[0])) if len(classification) > 1: for df in classification[1:]: df = pd.read_csv(os.path.join("genetic_summary", df)) - classification_all = pd.merge(classification_all, df, on='Barcode', how='outer') - classification_all.to_csv("genetic_summary/genetic_classification_all.csv", index=False) \ No newline at end of file + classification_all = pd.merge( + classification_all, df, on="Barcode", how="outer" + ) + classification_all.to_csv( + "genetic_summary/genetic_classification_all.csv", index=False + ) diff --git a/bin/summary_hash.py b/bin/summary_hash.py index f93bb27..590e016 100755 --- a/bin/summary_hash.py +++ b/bin/summary_hash.py @@ -7,75 +7,119 @@ from mudata import MuData parser = argparse.ArgumentParser(description="Parameters for summary process") -parser.add_argument("--demuxem", help="Folder containing output files of demuxem", default=None) -parser.add_argument("--htodemux", help="Folder containing output files of htodemux", default=None) -parser.add_argument("--multiseq", help="Folder containing output files of multiseq", default=None) -parser.add_argument("--hashsolo", help="Folder containing output files of hashsolo", default=None) -parser.add_argument("--hashedDrops", help="Folder containing output files of hashedDrops", default=None) +parser.add_argument( + "--demuxem", help="Folder containing output files of demuxem", default=None +) +parser.add_argument( + "--htodemux", help="Folder containing output files of htodemux", default=None +) +parser.add_argument( + "--multiseq", help="Folder containing output files of multiseq", default=None +) +parser.add_argument( + "--hashsolo", help="Folder containing output files of hashsolo", default=None +) +parser.add_argument( + "--hashedDrops", help="Folder containing output files of hashedDrops", default=None +) parser.add_argument("--bff", help="Folder containing output files of BFF", default=None) -parser.add_argument("--gmm_demux", help="Folder 
containing output files of GMM-Demux", default=None) -parser.add_argument("--generate_anndata", help="Generate anndata", action='store_true') -parser.add_argument("--generate_mudata", help="Generate mudata", action='store_true') -parser.add_argument("--read_rna_mtx", help="10x-Genomics-formatted mtx directory for gene expression", default=None) -parser.add_argument("--read_hto_mtx", help="10x-Genomics-formatted mtx directory for HTO expression", default=None) -parser.add_argument("--sampleId", help="sampleID if multiple samples are demultiplexed", default=None) +parser.add_argument( + "--gmm_demux", help="Folder containing output files of GMM-Demux", default=None +) +parser.add_argument("--generate_anndata", help="Generate anndata", action="store_true") +parser.add_argument("--generate_mudata", help="Generate mudata", action="store_true") +parser.add_argument( + "--read_rna_mtx", + help="10x-Genomics-formatted mtx directory for gene expression", + default=None, +) +parser.add_argument( + "--read_hto_mtx", + help="10x-Genomics-formatted mtx directory for HTO expression", + default=None, +) +parser.add_argument( + "--sampleId", help="sampleID if multiple samples are demultiplexed", default=None +) args = parser.parse_args() + def demuxem_summary(demuxem_res, raw_adata, raw_mudata): assign = [] classi = [] params = [] for x in demuxem_res: - obs_res_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("_obs.csv")][0]) + obs_res_dir = os.path.join( + x, + [filename for filename in os.listdir(x) if filename.endswith("_obs.csv")][ + 0 + ], + ) obs_res = pd.read_csv(obs_res_dir) obs_res.rename(columns={obs_res.columns[0]: "Barcode"}, inplace=True) demuxem_assign = obs_res[["Barcode", "assignment"]] demuxem_assign.columns = ["Barcode", os.path.basename(x)] - #demuxem_assign.loc[:, "Barcode"] = demuxem_assign["Barcode"].apply(lambda x: x + "-1") - #demuxem_assign["Barcode"] = demuxem_assign["Barcode"].astype(str) + "-1" + # demuxem_assign.loc[:, "Barcode"] = demuxem_assign["Barcode"].apply(lambda x: x + "-1") + # demuxem_assign["Barcode"] = demuxem_assign["Barcode"].astype(str) + "-1" demuxem_assign.index = demuxem_assign.Barcode - demuxem_assign = demuxem_assign.drop(columns=['Barcode']) + demuxem_assign = demuxem_assign.drop(columns=["Barcode"]) assign.append(demuxem_assign) print(assign) if raw_adata is not None: adata = raw_adata.copy() - adata.obs = adata.obs.merge(demuxem_assign, left_index=True, right_index=True, how='left') - adata.obs.rename(columns={adata.obs.columns[0]: 'donor'}, inplace=True) + adata.obs = adata.obs.merge( + demuxem_assign, left_index=True, right_index=True, how="left" + ) + adata.obs.rename(columns={adata.obs.columns[0]: "donor"}, inplace=True) adata.obs.donor = adata.obs.donor.fillna("negative") adata.obs.donor = adata.obs.donor.astype(str) - adata.write("hash_summary/adata/adata_with_"+os.path.basename(x)+".h5ad") - + adata.write( + "hash_summary/adata/adata_with_" + os.path.basename(x) + ".h5ad" + ) + if raw_mudata is not None: mudata = raw_mudata.copy() - mudata['rna'].obs = mudata['rna'].obs.merge(demuxem_assign, left_index=True, right_index=True, how='left') - mudata['rna'].obs.rename(columns={mudata['rna'].obs.columns[0]: 'donor'}, inplace=True) - mudata['rna'].obs.donor = mudata['rna'].obs.donor.fillna("negative") - mudata['rna'].obs.donor = mudata['rna'].obs.donor.astype(str) + mudata["rna"].obs = mudata["rna"].obs.merge( + demuxem_assign, left_index=True, right_index=True, how="left" + ) + mudata["rna"].obs.rename( + 
columns={mudata["rna"].obs.columns[0]: "donor"}, inplace=True + ) + mudata["rna"].obs.donor = mudata["rna"].obs.donor.fillna("negative") + mudata["rna"].obs.donor = mudata["rna"].obs.donor.astype(str) mudata.update() - mudata.write("hash_summary/mudata/mudata_with_mudata_"+ os.path.basename(x)+".h5mu") + mudata.write( + "hash_summary/mudata/mudata_with_mudata_" + + os.path.basename(x) + + ".h5mu" + ) demuxem_classi = obs_res[["Barcode", "demux_type"]] demuxem_classi.columns = ["Barcode", os.path.basename(x)] demuxem_classi = demuxem_classi.replace("unknown", "negative") - #demuxem_classi.loc[:, "Barcode"] = demuxem_classi["Barcode"].apply(lambda x: x + '-1') + # demuxem_classi.loc[:, "Barcode"] = demuxem_classi["Barcode"].apply(lambda x: x + '-1') demuxem_classi.index = demuxem_classi.Barcode - demuxem_classi = demuxem_classi.drop(columns=['Barcode']) + demuxem_classi = demuxem_classi.drop(columns=["Barcode"]) classi.append(demuxem_classi) - params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("params.csv")][0]) + params_dir = os.path.join( + x, + [filename for filename in os.listdir(x) if filename.endswith("params.csv")][ + 0 + ], + ) params_res = pd.read_csv(params_dir, keep_default_na=False, index_col=0) - #params_res.rename(columns={params_res.columns[1]: os.path.basename(x)}, inplace=True) + # params_res.rename(columns={params_res.columns[1]: os.path.basename(x)}, inplace=True) params_res.columns = [os.path.basename(x)] params.append(params_res) - assign = pd.concat(assign, axis=1) assign.to_csv("hash_summary/demuxem_assignment.csv", quoting=False) classi = pd.concat(classi, axis=1) classi.to_csv("hash_summary/demuxem_classification.csv", quoting=False) - + params = pd.concat(params, axis=1) params.to_csv("hash_summary/demuxem_params.csv") @@ -84,145 +128,244 @@ def hashsolo_summary(hashsolo_res, raw_adata, raw_mudata): assign = [] classi = [] params = [] - + for x in hashsolo_res: - obs_res_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("_res.csv")][0]) + obs_res_dir = os.path.join( + x, + [filename for filename in os.listdir(x) if filename.endswith("_res.csv")][ + 0 + ], + ) obs_res = pd.read_csv(obs_res_dir, index_col=0) - obs_res.index.name='Barcode' + obs_res.index.name = "Barcode" hashsolo_assign = obs_res[["Classification"]] hashsolo_assign.columns = [os.path.basename(x)] - hashsolo_assign = hashsolo_assign.replace({"Doublet": "doublet", "Negative": "negative"}) + hashsolo_assign = hashsolo_assign.replace( + {"Doublet": "doublet", "Negative": "negative"} + ) assign.append(hashsolo_assign) - + if raw_adata is not None: adata = raw_adata.copy() - adata.obs = adata.obs.merge(hashsolo_assign, left_index=True, right_index=True, how='left') - adata.obs = adata.obs.rename(columns={adata.obs.columns[0]: 'donor'}) + adata.obs = adata.obs.merge( + hashsolo_assign, left_index=True, right_index=True, how="left" + ) + adata.obs = adata.obs.rename(columns={adata.obs.columns[0]: "donor"}) adata.obs.donor = adata.obs.donor.fillna("negative") adata.obs.donor = adata.obs.donor.astype(str) - adata.write("hash_summary/adata/adata_with_"+os.path.basename(x)+".h5ad") - + adata.write( + "hash_summary/adata/adata_with_" + os.path.basename(x) + ".h5ad" + ) + if raw_mudata is not None: mudata = raw_mudata.copy() - mudata['rna'].obs = mudata['rna'].obs.merge(hashsolo_assign, left_index=True, right_index=True, how='left') - mudata['rna'].obs.rename(columns={mudata['rna'].obs.columns[0]: 'donor'}, inplace=True) - 
mudata['rna'].obs.donor = mudata['rna'].obs.donor.fillna("negative") - mudata['rna'].obs.donor = mudata['rna'].obs.donor.astype(str) + mudata["rna"].obs = mudata["rna"].obs.merge( + hashsolo_assign, left_index=True, right_index=True, how="left" + ) + mudata["rna"].obs.rename( + columns={mudata["rna"].obs.columns[0]: "donor"}, inplace=True + ) + mudata["rna"].obs.donor = mudata["rna"].obs.donor.fillna("negative") + mudata["rna"].obs.donor = mudata["rna"].obs.donor.astype(str) mudata.update() - mudata.write("hash_summary/mudata/mudata_with_mudata_"+ os.path.basename(x)+".h5mu") + mudata.write( + "hash_summary/mudata/mudata_with_mudata_" + + os.path.basename(x) + + ".h5mu" + ) hashsolo_classi = obs_res[["most_likely_hypothesis"]] hashsolo_classi_copy = hashsolo_classi.copy() - hashsolo_classi_copy.loc[hashsolo_classi_copy["most_likely_hypothesis"] == 0.0 , "most_likely_hypothesis"] = "negative" - hashsolo_classi_copy.loc[hashsolo_classi_copy["most_likely_hypothesis"] == 1.0 , "most_likely_hypothesis"] = "singlet" - hashsolo_classi_copy.loc[hashsolo_classi_copy["most_likely_hypothesis"] == 2.0 , "most_likely_hypothesis"] = "doublet" - + hashsolo_classi_copy.loc[ + hashsolo_classi_copy["most_likely_hypothesis"] == 0.0, + "most_likely_hypothesis", + ] = "negative" + hashsolo_classi_copy.loc[ + hashsolo_classi_copy["most_likely_hypothesis"] == 1.0, + "most_likely_hypothesis", + ] = "singlet" + hashsolo_classi_copy.loc[ + hashsolo_classi_copy["most_likely_hypothesis"] == 2.0, + "most_likely_hypothesis", + ] = "doublet" + hashsolo_classi_copy.columns = [os.path.basename(x)] classi.append(hashsolo_classi_copy) - - params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("params.csv")][0]) + + params_dir = os.path.join( + x, + [filename for filename in os.listdir(x) if filename.endswith("params.csv")][ + 0 + ], + ) params_res = pd.read_csv(params_dir, keep_default_na=False, index_col=0) params_res.columns = [os.path.basename(x)] params.append(params_res) assign = pd.concat(assign, axis=1) assign.to_csv("hash_summary/hashsolo_assignment.csv", quoting=False) - + classi = pd.concat(classi, axis=1) classi.to_csv("hash_summary/hashsolo_classification.csv", quoting=False) - + params = pd.concat(params, axis=1) params.to_csv("hash_summary/hashsolo_params.csv") + def hasheddrops_summary(hasheddrops_res, raw_adata, raw_mudata): assign = [] classi = [] params = [] - + for x in hasheddrops_res: - obs_res_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("_res.csv")][0]) + obs_res_dir = os.path.join( + x, + [filename for filename in os.listdir(x) if filename.endswith("_res.csv")][ + 0 + ], + ) obs_res = pd.read_csv(obs_res_dir) - - obs_res["Classification"] = np.where(obs_res["Confident"], "singlet", np.where(obs_res["Doublet"], "doublet", "negative")) - obs_res["Best"] = np.where(~obs_res["Classification"].isin(["doublet", "negative"]), obs_res["Best"], obs_res["Classification"]) + + obs_res["Classification"] = np.where( + obs_res["Confident"], + "singlet", + np.where(obs_res["Doublet"], "doublet", "negative"), + ) + obs_res["Best"] = np.where( + ~obs_res["Classification"].isin(["doublet", "negative"]), + obs_res["Best"], + obs_res["Classification"], + ) obs_res.rename(columns={obs_res.columns[0]: "Barcode"}, inplace=True) - + hasheddrops_res = obs_res[["Barcode", "Best"]] hasheddrops_res = hasheddrops_res.rename(columns={"Best": os.path.basename(x)}) assign.append(hasheddrops_res) if raw_adata is not None: adata = raw_adata.copy() - adata.obs = 
adata.obs.merge(hasheddrops_res, left_index=True, right_on='Barcode', how='left').set_index('Barcode') - adata.obs.rename(columns={adata.obs.columns[0]: 'donor'}, inplace=True) + adata.obs = adata.obs.merge( + hasheddrops_res, left_index=True, right_on="Barcode", how="left" + ).set_index("Barcode") + adata.obs.rename(columns={adata.obs.columns[0]: "donor"}, inplace=True) adata.obs.donor = adata.obs.donor.fillna("negative") adata.obs.donor = adata.obs.donor.astype(str) - adata.write("hash_summary/adata/adata_with_"+os.path.basename(x)+".h5ad") - + adata.write( + "hash_summary/adata/adata_with_" + os.path.basename(x) + ".h5ad" + ) + if raw_mudata is not None: mudata = raw_mudata.copy() - mudata['rna'].obs = mudata['rna'].obs.merge(hasheddrops_res, left_index=True, right_on='Barcode', how='left').set_index('Barcode') - mudata['rna'].obs.rename(columns={mudata['rna'].obs.columns[0]: 'donor'}, inplace=True) - mudata['rna'].obs.donor = mudata['rna'].obs.donor.fillna("negative") - mudata['rna'].obs.donor = mudata['rna'].obs.donor.astype(str) + mudata["rna"].obs = ( + mudata["rna"] + .obs.merge( + hasheddrops_res, left_index=True, right_on="Barcode", how="left" + ) + .set_index("Barcode") + ) + mudata["rna"].obs.rename( + columns={mudata["rna"].obs.columns[0]: "donor"}, inplace=True + ) + mudata["rna"].obs.donor = mudata["rna"].obs.donor.fillna("negative") + mudata["rna"].obs.donor = mudata["rna"].obs.donor.astype(str) mudata.update() - - mudata.write("hash_summary/mudata/mudata_with_mudata_"+ os.path.basename(x)+".h5mu") + mudata.write( + "hash_summary/mudata/mudata_with_mudata_" + + os.path.basename(x) + + ".h5mu" + ) hasheddrops_classi = obs_res[["Barcode", "Classification"]] - hasheddrops_classi = hasheddrops_classi.rename(columns={"Classification": os.path.basename(x)}) + hasheddrops_classi = hasheddrops_classi.rename( + columns={"Classification": os.path.basename(x)} + ) classi.append(hasheddrops_classi) - - params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("params.csv")][0]) - params_res = pd.read_csv(params_dir, usecols=[1, 2], keep_default_na=False, index_col=0) + + params_dir = os.path.join( + x, + [filename for filename in os.listdir(x) if filename.endswith("params.csv")][ + 0 + ], + ) + params_res = pd.read_csv( + params_dir, usecols=[1, 2], keep_default_na=False, index_col=0 + ) params_res.columns = [os.path.basename(x)] params.append(params_res) assign = pd.concat(assign, axis=1).reset_index(drop=True) assign.to_csv("hash_summary/hasheddrops_assignment.csv", index=False, quoting=False) - + classi = pd.concat(classi, axis=1).reset_index(drop=True) - classi.to_csv("hash_summary/hasheddrops_classification.csv", index=False, quoting=False) - + classi.to_csv( + "hash_summary/hasheddrops_classification.csv", index=False, quoting=False + ) + params = pd.concat(params, axis=1) params.to_csv("hash_summary/hasheddrops_params.csv") + def multiseq_summary(multiseq_res, raw_adata, raw_mudata): assign = [] params = [] for x in multiseq_res: - obs_res_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("_res.csv")][0]) + obs_res_dir = os.path.join( + x, + [filename for filename in os.listdir(x) if filename.endswith("_res.csv")][ + 0 + ], + ) multiseq_assign = pd.read_csv(obs_res_dir) multiseq_assign.columns = ["Barcode", os.path.basename(x)] multiseq_assign.set_index("Barcode", inplace=True) - multiseq_assign.replace({"Doublet": "doublet", "Negative": "negative"}, inplace=True) + multiseq_assign.replace( + {"Doublet": "doublet", 
"Negative": "negative"}, inplace=True + ) assign.append(multiseq_assign) - + if raw_adata is not None: adata = raw_adata.copy() - adata.obs = adata.obs.merge(multiseq_assign, left_index=True, right_index=True, how='left') - adata.obs.rename(columns={adata.obs.columns[0]: 'donor'}, inplace=True) + adata.obs = adata.obs.merge( + multiseq_assign, left_index=True, right_index=True, how="left" + ) + adata.obs.rename(columns={adata.obs.columns[0]: "donor"}, inplace=True) adata.obs.donor = adata.obs.donor.fillna("negative") adata.obs.donor = adata.obs.donor.astype(str) - adata.write("hash_summary/adata/adata_with_"+os.path.basename(x)+".h5ad") + adata.write( + "hash_summary/adata/adata_with_" + os.path.basename(x) + ".h5ad" + ) if raw_mudata is not None: mudata = raw_mudata.copy() - mudata['rna'].obs = mudata['rna'].obs.merge(multiseq_assign, left_index=True, right_index=True, how='left') - mudata['rna'].obs.rename(columns={mudata['rna'].obs.columns[0]: 'donor'}, inplace=True) - mudata['rna'].obs.donor = mudata['rna'].obs.donor.fillna("negative") - mudata['rna'].obs.donor = mudata['rna'].obs.donor.astype(str) + mudata["rna"].obs = mudata["rna"].obs.merge( + multiseq_assign, left_index=True, right_index=True, how="left" + ) + mudata["rna"].obs.rename( + columns={mudata["rna"].obs.columns[0]: "donor"}, inplace=True + ) + mudata["rna"].obs.donor = mudata["rna"].obs.donor.fillna("negative") + mudata["rna"].obs.donor = mudata["rna"].obs.donor.astype(str) mudata.update() - mudata.write("hash_summary/mudata/mudata_with_mudata_"+ os.path.basename(x)+".h5mu") - - - params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("params.csv")][0]) - params_res = pd.read_csv(params_dir, usecols=[1, 2], keep_default_na=False, index_col=0) + mudata.write( + "hash_summary/mudata/mudata_with_mudata_" + + os.path.basename(x) + + ".h5mu" + ) + + params_dir = os.path.join( + x, + [filename for filename in os.listdir(x) if filename.endswith("params.csv")][ + 0 + ], + ) + params_res = pd.read_csv( + params_dir, usecols=[1, 2], keep_default_na=False, index_col=0 + ) params_res.columns = [os.path.basename(x)] params.append(params_res) - + assign = pd.concat(assign, axis=1) assign.to_csv("hash_summary/multiseq_assignment.csv", quoting=False) @@ -233,50 +376,84 @@ def multiseq_summary(multiseq_res, raw_adata, raw_mudata): params = pd.concat(params, axis=1) params.to_csv("hash_summary/multiseq_params.csv") + def htodemux_summary(htodemux_res, raw_adata, raw_mudata): assign = [] classi = [] params = [] for x in htodemux_res: - obs_res_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("_assignment_htodemux.csv")][0]) + obs_res_dir = os.path.join( + x, + [ + filename + for filename in os.listdir(x) + if filename.endswith("_assignment_htodemux.csv") + ][0], + ) htodemux_assign = pd.read_csv(obs_res_dir) htodemux_assign.columns = ["Barcode", os.path.basename(x)] htodemux_assign.replace("Doublet", "doublet", inplace=True) htodemux_assign.replace("Negative", "negative", inplace=True) htodemux_assign.index = htodemux_assign.Barcode - htodemux_assign = htodemux_assign.drop(columns=['Barcode']) + htodemux_assign = htodemux_assign.drop(columns=["Barcode"]) assign.append(htodemux_assign) if raw_adata is not None: adata = raw_adata.copy() - adata.obs = adata.obs.merge(htodemux_assign, left_index=True, right_index=True, how='left') - adata.obs.rename(columns={adata.obs.columns[0]: 'donor'}, inplace=True) + adata.obs = adata.obs.merge( + htodemux_assign, left_index=True, 
right_index=True, how="left" + ) + adata.obs.rename(columns={adata.obs.columns[0]: "donor"}, inplace=True) adata.obs.donor = adata.obs.donor.fillna("negative") adata.obs.donor = adata.obs.donor.astype(str) - adata.write("hash_summary/adata/adata_with_"+os.path.basename(x)+".h5ad") + adata.write( + "hash_summary/adata/adata_with_" + os.path.basename(x) + ".h5ad" + ) if raw_mudata is not None: mudata = raw_mudata.copy() - mudata['rna'].obs = mudata['rna'].obs.merge(htodemux_assign, left_index=True, right_on='Barcode', how='left').set_index('Barcode') - mudata['rna'].obs.rename(columns={mudata['rna'].obs.columns[0]: 'donor'}, inplace=True) - mudata['rna'].obs.donor = mudata['rna'].obs.donor.fillna("negative") - mudata['rna'].obs.donor = mudata['rna'].obs.donor.astype(str) + mudata["rna"].obs = ( + mudata["rna"] + .obs.merge( + htodemux_assign, left_index=True, right_on="Barcode", how="left" + ) + .set_index("Barcode") + ) + mudata["rna"].obs.rename( + columns={mudata["rna"].obs.columns[0]: "donor"}, inplace=True + ) + mudata["rna"].obs.donor = mudata["rna"].obs.donor.fillna("negative") + mudata["rna"].obs.donor = mudata["rna"].obs.donor.astype(str) mudata.update() - mudata.write("hash_summary/mudata/mudata_with_mudata_"+ os.path.basename(x)+".h5mu") - - - obs_res_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("_classification_htodemux.csv")][0]) + mudata.write( + "hash_summary/mudata/mudata_with_mudata_" + + os.path.basename(x) + + ".h5mu" + ) + + obs_res_dir = os.path.join( + x, + [ + filename + for filename in os.listdir(x) + if filename.endswith("_classification_htodemux.csv") + ][0], + ) htodemux_classi = pd.read_csv(obs_res_dir) htodemux_classi.columns = ["Barcode", os.path.basename(x)] htodemux_classi.replace("Singlet", "singlet", inplace=True) htodemux_classi.replace("Doublet", "doublet", inplace=True) htodemux_classi.replace("Negative", "negative", inplace=True) htodemux_classi.index = htodemux_classi.Barcode - htodemux_classi = htodemux_classi.drop(columns=['Barcode']) + htodemux_classi = htodemux_classi.drop(columns=["Barcode"]) classi.append(htodemux_classi) - params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename == "params.csv"][0]) - params_res = pd.read_csv(params_dir, usecols=[1, 2], keep_default_na=False, index_col=0) + params_dir = os.path.join( + x, [filename for filename in os.listdir(x) if filename == "params.csv"][0] + ) + params_res = pd.read_csv( + params_dir, usecols=[1, 2], keep_default_na=False, index_col=0 + ) params_res.columns = [os.path.basename(x)] params.append(params_res) @@ -285,223 +462,311 @@ def htodemux_summary(htodemux_res, raw_adata, raw_mudata): classi = pd.concat(classi, axis=1) classi.to_csv("hash_summary/htodemux_classification.csv", quoting=False) - + params = pd.concat(params, axis=1) params.to_csv("hash_summary/htodemux_params.csv") - -def gmm_summary(gmmDemux_res,raw_adata, raw_mudata): + + +def gmm_summary(gmmDemux_res, raw_adata, raw_mudata): classi = [] assign = [] params = [] for x in gmmDemux_res: - obs_res_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("GMM_full.csv")][0]) - #we get the number of hashes used for the experiment from the parameters - #so that we now how many kinds of singlets we could find in the assignment + obs_res_dir = os.path.join( + x, + [ + filename + for filename in os.listdir(x) + if filename.endswith("GMM_full.csv") + ][0], + ) + # we get the number of hashes used for the experiment from the parameters + # so that we 
know how many kinds of singlets we could find in the assignment params_dir = os.path.join(x, "params.csv") - params_res = pd.read_csv(params_dir,index_col=False) + params_res = pd.read_csv(params_dir, index_col=False) params_res.columns = ["Argument", os.path.basename(x)] params.append(params_res) - + ##### Get number of hashes used for the experiment - result_row = params_res[params_res['Argument'].str.contains('hto_name_gmm', case=False, na=False)] + result_row = params_res[ + params_res["Argument"].str.contains("hto_name_gmm", case=False, na=False) + ] hashes_used = "" if not result_row.empty: hashes_used = result_row[os.path.basename(x)].iloc[0] else: print("No row contains the number of hashes") - hashes = hashes_used.split(',') + hashes = hashes_used.split(",") number_of_hashes = len(hashes) - #----------------- + # ----------------- - #GMM assignement file - results GMM + # GMM assignment file - results GMM gmm_classi = pd.read_csv(obs_res_dir) - - #GMM full is the name given by GMM_demux per default to all results + # GMM full is the name given by GMM_demux per default to all results ##classif_file - contains the mapping of results classification_config = os.path.join(x, "GMM_full.config") - classif_file = pd.read_csv(classification_config,header=None) + classif_file = pd.read_csv(classification_config, header=None) - - #Classification and Assigment come from the same file + + # Classification and Assignment come from the same file gmm_dt = pd.DataFrame(gmm_classi) classification_dt = pd.DataFrame(classif_file) - - #change column names - classification_dt = classification_dt.rename(columns={0: "Cluster_id", 1: "assignment"}) - + + # change column names + classification_dt = classification_dt.rename( + columns={0: "Cluster_id", 1: "assignment"} + ) + gmm_dt = gmm_dt.rename(columns={"Unnamed: 0": "Barcode"}) - - #Create classification following the assignment found for the barcodes - #we keep the original assigment and add a classification column - def _classify_hash(row,number_hashes): + + # Create classification following the assignment found for the barcodes + # we keep the original assignment and add a classification column + def _classify_hash(row, number_hashes): print(f"current row: {row}") if row == 0: - return 'negative' + return "negative" elif row > 0 and row <= number_hashes: print("singlet found") - return 'singlet' + return "singlet" else: - return 'doublet' + return "doublet" - classification_dt['Classification'] = classification_dt['Cluster_id'].apply(lambda x: _classify_hash(x, number_of_hashes)) - - #Compare classification guide file with classification found - #merged = pd.merge(classification_dt, gmm_dt, on='Cluster_id', how='left') + classification_dt["Classification"] = classification_dt["Cluster_id"].apply( + lambda x: _classify_hash(x, number_of_hashes) + ) + + # Compare classification guide file with classification found + # merged = pd.merge(classification_dt, gmm_dt, on='Cluster_id', how='left') new_rows = [] for index, row in gmm_dt.iterrows(): - cluster_id = row['Cluster_id'] - matching_row_map = classification_dt[classification_dt['Cluster_id'] == cluster_id] + cluster_id = row["Cluster_id"] + matching_row_map = classification_dt[ + classification_dt["Cluster_id"] == cluster_id + ] if not matching_row_map.empty: # Extract assignment and classification values from the matching row in df2 - assignment_gmm =
matching_row_map.iloc[0]["assignment"] + classification_gmm = matching_row_map.iloc[0]["Classification"] new_row = { - 'Barcode': row['Barcode'], - 'Cluster_id': cluster_id, - 'assignment': assignment_gmm, - 'Classification': classification_gmm + "Barcode": row["Barcode"], + "Cluster_id": cluster_id, + "assignment": assignment_gmm, + "Classification": classification_gmm, } new_rows.append(new_row) merged = pd.DataFrame(new_rows) - - merged['assignment'] = merged.apply(lambda row: 'doublet' if 'doublet' in row['Classification'] else row['assignment'], axis=1) + + merged["assignment"] = merged.apply( + lambda row: "doublet" + if "doublet" in row["Classification"] + else row["assignment"], + axis=1, + ) print(merged) - gmm_dt['Classification'] = merged['Classification'] - gmm_dt['Assignment'] = merged['assignment'] - #instead of multiple hashes, add doublet - gmm_dt["Assignment"] = gmm_dt["Assignment"].apply(lambda x: "doublet" if "-" in x else x) - classification_dt['Classification'] = classification_dt['Classification'].str.replace(' ', '') - - - #Assigment for GMM-Demux + gmm_dt["Classification"] = merged["Classification"] + gmm_dt["Assignment"] = merged["assignment"] + # instead of multiple hashes, add doublet + gmm_dt["Assignment"] = gmm_dt["Assignment"].apply( + lambda x: "doublet" if "-" in x else x + ) + classification_dt["Classification"] = classification_dt[ + "Classification" + ].str.replace(" ", "") + + # Assignment for GMM-Demux #'Cluster_id', - gmm_dt_assign = gmm_dt.drop(['Cluster_id','Confidence','Classification' ], axis=1) - gmm_dt_assign['Assignment'] = gmm_dt_assign['Assignment'].str.replace(' ', '') + gmm_dt_assign = gmm_dt.drop( + ["Cluster_id", "Confidence", "Classification"], axis=1 + ) + gmm_dt_assign["Assignment"] = gmm_dt_assign["Assignment"].str.replace(" ", "") gmm_dt_assign.columns = ["Barcode", os.path.basename(x)] assign.append(gmm_dt_assign) if raw_adata is not None: adata = raw_adata.copy() - adata.obs = adata.obs.merge(gmm_dt_assign, left_index=True, right_on='Barcode', how='left').set_index('Barcode') - adata.obs.rename(columns={adata.obs.columns[0]: 'donor'}, inplace=True) + adata.obs = adata.obs.merge( + gmm_dt_assign, left_index=True, right_on="Barcode", how="left" + ).set_index("Barcode") + adata.obs.rename(columns={adata.obs.columns[0]: "donor"}, inplace=True) adata.obs.donor = adata.obs.donor.fillna("negative") adata.obs.donor = adata.obs.donor.astype(str) - adata.write("hash_summary/adata/adata_with_"+os.path.basename(x)+".h5ad") - + adata.write( + "hash_summary/adata/adata_with_" + os.path.basename(x) + ".h5ad" + ) + if raw_mudata is not None: mudata = raw_mudata.copy() - mudata['rna'].obs = mudata['rna'].obs.merge(gmm_dt_assign, left_index=True, right_on='Barcode', how='left').set_index('Barcode') - mudata['rna'].obs.rename(columns={mudata['rna'].obs.columns[0]: 'donor'}, inplace=True) - mudata['rna'].obs.donor = mudata['rna'].obs.donor.fillna("negative") - mudata['rna'].obs.donor = mudata['rna'].obs.donor.astype(str) + mudata["rna"].obs = ( + mudata["rna"] + .obs.merge( + gmm_dt_assign, left_index=True, right_on="Barcode", how="left" + ) + .set_index("Barcode") + ) + mudata["rna"].obs.rename( + columns={mudata["rna"].obs.columns[0]: "donor"}, inplace=True + ) + mudata["rna"].obs.donor = mudata["rna"].obs.donor.fillna("negative") + mudata["rna"].obs.donor = mudata["rna"].obs.donor.astype(str) mudata.update() - mudata.write("hash_summary/mudata/mudata_with_mudata_"+ os.path.basename(x)+".h5mu") - - #Classification for GMM-Demux - gmm_dt_classi =
gmm_dt.drop(['Cluster_id','Confidence','Assignment' ], axis=1) - gmm_dt_classi.columns =["Barcode", os.path.basename(x)] + mudata.write( + "hash_summary/mudata/mudata_with_mudata_" + + os.path.basename(x) + + ".h5mu" + ) + + # Classification for GMM-Demux + gmm_dt_classi = gmm_dt.drop(["Cluster_id", "Confidence", "Assignment"], axis=1) + gmm_dt_classi.columns = ["Barcode", os.path.basename(x)] classi.append(gmm_dt_classi) params_dir = os.path.join(x, "params.csv") - params_res = pd.read_csv(params_dir,index_col=False) + params_res = pd.read_csv(params_dir, index_col=False) params_res.columns = ["Argument", os.path.basename(x)] params.append(params_res) classi_df = pd.concat(classi, axis=1, join="outer") - classi_df.to_csv("hash_summary" +"/GMM_classification.csv", index=False) - + classi_df.to_csv("hash_summary" + "/GMM_classification.csv", index=False) + assign_df = pd.concat(assign, axis=1, join="outer") - assign_df.to_csv("hash_summary" +"/GMM_assignment.csv", index=False, sep=",") - + assign_df.to_csv("hash_summary" + "/GMM_assignment.csv", index=False, sep=",") + params_df = pd.concat(params, axis=1, join="outer") - params_df.to_csv("hash_summary" +"/GMM_params.csv", index=False) + params_df.to_csv("hash_summary" + "/GMM_params.csv", index=False) -def bff_summary(bff_res,raw_adata, raw_mudata): + +def bff_summary(bff_res, raw_adata, raw_mudata): classi = [] assign = [] params = [] for x in bff_res: - obs_res_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename.endswith("_bff.csv")][0]) + obs_res_dir = os.path.join( + x, + [filename for filename in os.listdir(x) if filename.endswith("_bff.csv")][ + 0 + ], + ) bff_assign = pd.read_csv(obs_res_dir) data_bff = pd.DataFrame(bff_assign) if data_bff.empty: - #no results create empty dataframe for empty col - column_names = ['Barcode', os.path.basename(x)] + # no results: create empty dataframe for empty col + column_names = ["Barcode", os.path.basename(x)] # Create an empty dataframe with only column names df = pd.DataFrame(columns=column_names) classi.append(df) assign.append(df) else: - #df contain data and we save it in the same way + # df contains data and we save it in the same way dt_assign = data_bff.copy() - #check if the columns contain results from both bff_s or only one - column_names = ["Unnamed: 0","bff_raw","bff_cluster","consensuscall.global"] + # check if the columns contain results from both BFF variants or only one + column_names = [ + "Unnamed: 0", + "bff_raw", + "bff_cluster", + "consensuscall.global", + ] for column in column_names: if column in dt_assign.columns: dt_assign = dt_assign.drop([column], axis=1) - #dt_assign.loc[dt_assign["consensuscall"] == "Singlet", "consensuscall"] = "singlet" - dt_assign.loc[dt_assign["consensuscall"] == "Doublet", "consensuscall"] = "doublet" - dt_assign.loc[dt_assign["consensuscall"] == "Negative", "consensuscall"] = "negative" - dt_assign['consensuscall'] = dt_assign['consensuscall'].astype('category') - dt_assign = dt_assign.rename(columns={"cellbarcode": "Barcode", "consensuscall": os.path.basename(x)}) - dt_assign["Barcode"] = dt_assign["Barcode"].apply(lambda x: x + "-1" if isinstance(x, str) else x) - + # dt_assign.loc[dt_assign["consensuscall"] == "Singlet", "consensuscall"] = "singlet" + dt_assign.loc[ + dt_assign["consensuscall"] == "Doublet", "consensuscall" + ] = "doublet" + dt_assign.loc[ + dt_assign["consensuscall"] == "Negative", "consensuscall" + ] = "negative" + dt_assign["consensuscall"] = dt_assign["consensuscall"].astype("category") + dt_assign =
dt_assign.rename( + columns={"cellbarcode": "Barcode", "consensuscall": os.path.basename(x)} + ) + dt_assign["Barcode"] = dt_assign["Barcode"].apply( + lambda x: x + "-1" if isinstance(x, str) else x + ) + assign.append(dt_assign) print("assign bff barcodes") print(assign) if raw_adata is not None: adata = raw_adata.copy() - adata.obs = adata.obs.merge(dt_assign, left_index=True, right_index=True, how='left') - adata.obs.rename(columns={adata.obs.columns[0]: 'donor'}, inplace=True) + adata.obs = adata.obs.merge( + dt_assign, left_index=True, right_index=True, how="left" + ) + adata.obs.rename(columns={adata.obs.columns[0]: "donor"}, inplace=True) adata.obs.donor = adata.obs.donor.fillna("negative") adata.obs.donor = adata.obs.donor.astype(str) - adata.write_h5ad("hash_summary/adata/adata_with_"+os.path.basename(x)+".h5ad") - + adata.write_h5ad( + "hash_summary/adata/adata_with_" + os.path.basename(x) + ".h5ad" + ) if raw_mudata is not None: mudata = raw_mudata.copy() - mudata['rna'].obs = mudata['rna'].obs.merge(dt_assign, left_index=True, right_index=True, how='left') - mudata['rna'].obs.rename(columns={mudata['rna'].obs.columns[0]: 'donor'}, inplace=True) - mudata['rna'].obs.donor = mudata['rna'].obs.donor.fillna("negative") - mudata['rna'].obs.donor = mudata['rna'].obs.donor.astype(str) + mudata["rna"].obs = mudata["rna"].obs.merge( + dt_assign, left_index=True, right_index=True, how="left" + ) + mudata["rna"].obs.rename( + columns={mudata["rna"].obs.columns[0]: "donor"}, inplace=True + ) + mudata["rna"].obs.donor = mudata["rna"].obs.donor.fillna("negative") + mudata["rna"].obs.donor = mudata["rna"].obs.donor.astype(str) mudata.update() - mudata.write("hash_summary/mudata/mudata_with_mudata_"+ os.path.basename(x)+".h5mu") + mudata.write( + "hash_summary/mudata/mudata_with_mudata_" + + os.path.basename(x) + + ".h5mu" + ) - dt_classi = data_bff.copy() - column_names_class = ["bff_raw","bff_cluster","consensuscall"] + column_names_class = ["bff_raw", "bff_cluster", "consensuscall"] for column in column_names_class: if column in dt_assign.columns: dt_classi = dt_classi.drop([column], axis=1) - dt_classi.loc[dt_classi["consensuscall.global"] == "Singlet", "consensuscall.global"] = "singlet" - dt_classi.loc[dt_classi["consensuscall.global"] == "Doublet", "consensuscall.global"] = "doublet" - dt_classi.loc[dt_classi["consensuscall.global"] == "Negative", "consensuscall.global"] = "negative" - dt_classi = dt_classi.rename(columns={"cellbarcode": "Barcode", "consensuscall.global": os.path.basename(x)}) - dt_classi["Barcode"] = dt_classi["Barcode"].apply(lambda x: x + "-1" if isinstance(x, str) else x) + dt_classi.loc[ + dt_classi["consensuscall.global"] == "Singlet", "consensuscall.global" + ] = "singlet" + dt_classi.loc[ + dt_classi["consensuscall.global"] == "Doublet", "consensuscall.global" + ] = "doublet" + dt_classi.loc[ + dt_classi["consensuscall.global"] == "Negative", "consensuscall.global" + ] = "negative" + dt_classi = dt_classi.rename( + columns={ + "cellbarcode": "Barcode", + "consensuscall.global": os.path.basename(x), + } + ) + dt_classi["Barcode"] = dt_classi["Barcode"].apply( + lambda x: x + "-1" if isinstance(x, str) else x + ) print("classification bff barcodes") print(assign) classi.append(dt_classi) - params_dir = os.path.join(x, [filename for filename in os.listdir(x) if filename == "params.csv"][0]) - params_res = pd.read_csv(params_dir, usecols=[1, 2], keep_default_na=False, index_col=0) + params_dir = os.path.join( + x, [filename for filename in os.listdir(x) if filename 
== "params.csv"][0] + ) + params_res = pd.read_csv( + params_dir, usecols=[1, 2], keep_default_na=False, index_col=0 + ) params_res.columns = [os.path.basename(x)] params.append(params_res) classi_df = pd.concat(classi, axis=1, join="outer") - classi_df.to_csv("hash_summary" +"/bff_classification.csv", index=False) - + classi_df.to_csv("hash_summary" + "/bff_classification.csv", index=False) + assign_df = pd.concat(assign, axis=1, join="outer") - assign_df.to_csv("hash_summary" +"/bff_assignment.csv", index=False) - + assign_df.to_csv("hash_summary" + "/bff_assignment.csv", index=False) + params = pd.concat(params, axis=1) - params.to_csv("hash_summary" +"/bff_params.csv") + params.to_csv("hash_summary" + "/bff_params.csv") + -if __name__ == '__main__': +if __name__ == "__main__": adata = None mudata = None @@ -517,61 +782,66 @@ def bff_summary(bff_res,raw_adata, raw_mudata): rna_data = sc.read_10x_mtx(args.read_rna_mtx) path_hto = args.read_hto_mtx hto_data = sc.read_10x_mtx(args.read_hto_mtx, gex_only=False) - mudata = MuData({"rna": rna_data, "hto": hto_data }) - + mudata = MuData({"rna": rna_data, "hto": hto_data}) + if args.hashedDrops is not None: - hashedDrops_res = args.hashedDrops.split(':') + hashedDrops_res = args.hashedDrops.split(":") hasheddrops_summary(hashedDrops_res, adata, mudata) if args.demuxem is not None: - demuxem_res = args.demuxem.split(':') + demuxem_res = args.demuxem.split(":") demuxem_summary(demuxem_res, adata, mudata) if args.hashsolo is not None: - hashsolo_res = args.hashsolo.split(':') + hashsolo_res = args.hashsolo.split(":") hashsolo_summary(hashsolo_res, adata, mudata) if args.multiseq is not None: - multiseq_res = args.multiseq.split(':') + multiseq_res = args.multiseq.split(":") multiseq_summary(multiseq_res, adata, mudata) if args.htodemux is not None: - htodemux_res = args.htodemux.split(':') + htodemux_res = args.htodemux.split(":") htodemux_summary(htodemux_res, adata, mudata) - if args.gmm_demux is not None: - gmmDemux_res = args.gmm_demux.split(':') + gmmDemux_res = args.gmm_demux.split(":") gmm_summary(gmmDemux_res, adata, mudata) if args.bff is not None: - bff_res = args.bff.split(':') + bff_res = args.bff.split(":") bff_summary(bff_res, adata, mudata) - # Read and combine assignment files - assignment = [file for file in os.listdir("hash_summary") if file.endswith("_assignment.csv")] + assignment = [ + file for file in os.listdir("hash_summary") if file.endswith("_assignment.csv") + ] assignment_all = pd.read_csv(os.path.join("hash_summary", assignment[0])) - - if len(assignment) > 1: for df in assignment[1:]: - df = pd.read_csv(os.path.join("hash_summary", df)) - assignment_all = pd.merge(assignment_all, df, on='Barcode', how='outer') - #print("---------assignment all--------") - #print(assignment_all) - #print("-----------------") - + assignment_all = pd.merge(assignment_all, df, on="Barcode", how="outer") + # print("---------assignment all--------") + # print(assignment_all) + # print("-----------------") + assignment_all.to_csv("hash_summary/hashing_assignment_all.csv", index=False) # Read and combine classification files - classification = [file for file in os.listdir("hash_summary") if file.endswith("_classification.csv")] + classification = [ + file + for file in os.listdir("hash_summary") + if file.endswith("_classification.csv") + ] classification_all = pd.read_csv(os.path.join("hash_summary", classification[0])) if len(classification) > 1: for df in classification[1:]: df = pd.read_csv(os.path.join("hash_summary", df)) - 
classification_all = pd.merge(classification_all, df, on='Barcode', how='outer') - classification_all.to_csv("hash_summary/hashing_classification_all.csv", index=False) \ No newline at end of file + classification_all = pd.merge( + classification_all, df, on="Barcode", how="outer" + ) + classification_all.to_csv( + "hash_summary/hashing_classification_all.csv", index=False + ) diff --git a/conda/bff.yml b/conda/bff.yml index e014dff..34b6db8 100644 --- a/conda/bff.yml +++ b/conda/bff.yml @@ -18,4 +18,3 @@ dependencies: - conda-forge::r-here - conda-forge::r-argparse - conda-forge::r-dplyr - diff --git a/conda/condaenv.ucda_93c.requirements.txt b/conda/condaenv.ucda_93c.requirements.txt deleted file mode 100644 index 4b3fe5b..0000000 --- a/conda/condaenv.ucda_93c.requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -GMM_Demux==0.2.1.3 -scikit-learn==1.1.3 -argparse \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index 5678883..d91c599 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -1,22 +1,21 @@ -project = 'hadge' -copyright = '2023, Fabiola Curion, Xichen Wu, Lukas Heumos' -author = 'Fabiola Curion, Xichen Wu, Lukas Heumos' -release = '1.0.0' +project = "hadge" +copyright = "2023, Fabiola Curion, Xichen Wu, Lukas Heumos" +author = "Fabiola Curion, Xichen Wu, Lukas Heumos" +release = "1.0.0" -extensions = ['myst_parser', - 'nbsphinx'] +extensions = ["myst_parser", "nbsphinx"] source_suffix = { - '.rst': 'restructuredtext', - '.txt': 'markdown', - '.md': 'markdown', + ".rst": "restructuredtext", + ".txt": "markdown", + ".md": "markdown", } -templates_path = ['_templates'] +templates_path = ["_templates"] -exclude_patterns = ['_build', '**.ipynb_checkpoints'] +exclude_patterns = ["_build", "**.ipynb_checkpoints"] html_theme = "furo" -html_static_path = ['_static'] +html_static_path = ["_static"] -nbsphinx_execute = 'never' +nbsphinx_execute = "never" diff --git a/subworkflows/HADGE.nf b/subworkflows/HADGE.nf index 33046b4..77489d4 100644 --- a/subworkflows/HADGE.nf +++ b/subworkflows/HADGE.nf @@ -42,13 +42,10 @@ workflow HADGE { workflow SUMMARY{ - - log.info('running summary only') Channel.fromPath(params.multi_input).splitCsv(header:true).map { row-> tuple(row.sampleId, file(row.hto_matrix_filtered), file(row.rna_matrix_filtered))}.set {input_list_summary} - demuxlet_out = Channel.fromPath("${params.outdir}/*/genetic/gene_demulti/demuxlet/demuxlet_*", type: 'dir').collect().ifEmpty('no_result') freemuxlet_out= Channel.fromPath("${params.outdir}/*/genetic/gene_demulti/freemuxlet/freemuxlet_*", type: 'dir').collect().ifEmpty('no_result') vireo_out= Channel.fromPath("${params.outdir}/*/genetic/gene_demulti/vireo/vireo_*", type: 'dir').collect().ifEmpty('no_result') @@ -66,11 +63,4 @@ workflow SUMMARY{ summary(summary_input, params.generate_anndata, params.generate_mudata) - - // Channel.fromPath(params.multi_input) \ - // | splitCsv(header:true) \ - // | map { row-> tuple(row.sampleId, row.nsample, row.barcodes, "None", "None")} - // | join(summary.out) - // | donor_match - } \ No newline at end of file From cf1090898a9a3e783b6ef0b9927380e918e08daf Mon Sep 17 00:00:00 2001 From: zethson Date: Wed, 7 Feb 2024 11:54:09 +0100 Subject: [PATCH 10/16] Refactoring Signed-off-by: zethson --- modules/multi_demultiplexing.nf | 8 -------- 1 file changed, 8 deletions(-) diff --git a/modules/multi_demultiplexing.nf b/modules/multi_demultiplexing.nf index 7aed27f..ca4d690 100644 --- a/modules/multi_demultiplexing.nf +++ b/modules/multi_demultiplexing.nf @@ -69,9 
+69,7 @@ workflow run_multi{ main: if (params.mode == "genetic"){ - // Performing genetic demultiplexing methodologies gene_demultiplexing(input_channel) - //////////// if (params.match_donor == "True"){ @@ -80,9 +78,7 @@ workflow run_multi{ } else if (params.mode == "hashing"){ - // Performing hashing demultplexing hash_demultiplexing(input_channel) - //////////// if (params.match_donor == "True"){ input_channel.splitCsv(header:true).map { row -> tuple(row.sampleId, row.nsample, row.barcodes, "None", "None")}.join(hash_demultiplexing.out).set{dm_input} @@ -90,10 +86,8 @@ workflow run_multi{ } else if (params.mode == "rescue"){ - // Performing both hashing and genetic demultiplexing methods hash_demultiplexing(input_channel) gene_demultiplexing(input_channel) - //////////// gene_summary = gene_demultiplexing.out hash_summary = hash_demultiplexing.out @@ -106,11 +100,9 @@ workflow run_multi{ } else if (params.mode == "donor_match"){ - // Performing just donor matching input_channel.splitCsv(header:true).map { row -> tuple(row.sampleId, row.nsample, row.barcodes, row.celldata, row.vireo_parent_dir, row.demultiplexing_result)}.set{dm_input} } - if (params.match_donor == "True" || params.mode == "donor_match"){ donor_match(dm_input) From 1fdd3ccb9c169cc19cb78fb5e7c6407794c16b9c Mon Sep 17 00:00:00 2001 From: zethson Date: Wed, 7 Feb 2024 12:01:18 +0100 Subject: [PATCH 11/16] Fix violin Signed-off-by: zethson --- bin/demuxem.py | 2 +- docs/source/general.md | 20 +++++++++----------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/bin/demuxem.py b/bin/demuxem.py index d03ae41..28ab22b 100755 --- a/bin/demuxem.py +++ b/bin/demuxem.py @@ -156,7 +156,7 @@ rna_data.X.data = np.log1p(rna_data.X.data) for gene_name in args.generateGenderPlot: - demuxEM.plot_gene_violin( + io.violin( rna_data, gene_name, "{output_name}.{gene_name}.violin.pdf".format( diff --git a/docs/source/general.md b/docs/source/general.md index 5a0d9cb..7ea4440 100644 --- a/docs/source/general.md +++ b/docs/source/general.md @@ -2,7 +2,7 @@ ## **hadge: a comprehensive pipeline for donor deconvolution in single cell** -Preprint manuscript is available [here](https://www.biorxiv.org/content/10.1101/2023.07.23.550061v2) +A preprint is available [here](https://www.biorxiv.org/content/10.1101/2023.07.23.550061v2). ![Caption](_static/images/pipeline.png) @@ -17,7 +17,7 @@ The mode of the pipeline is set by `params.mode`. hadge provides 4 modes in tota ## **Pipeline configuration** -The pipeline provides some pre-defined profiles. The standard profile is used by default when no profile is specified, where the pipeeline is run locall and all processes annotated with the big_mem label are assigned 4 cpus and 16 Gb of memory. +The pipeline provides some pre-defined profiles. The standard profile is used by default when no profile is specified, where the pipeline is run locally and all processes annotated with the big_mem label are assigned 4 cpus and 16 Gb of memory. ``` profiles{ @@ -37,9 +37,10 @@ profiles{ } ``` -### Conda environments: +### Conda environments -By using the `-profile conda` option, the pipeline executes each process within a Conda environment specified in the conda directive. Alternatively, you have the flexibility to add a new profile in the `nextflow.config` file, allowing you to use local Conda environments for running processes. +By using the `-profile conda` option, the pipeline executes each process within a Conda environment specified in the conda directive. 
+Alternatively, you have the flexibility to add a new profile in the `nextflow.config` file, allowing you to use local Conda environments for running processes. ``` profiles{ @@ -64,7 +65,7 @@ profiles{ } ``` -### Containers: +### Containers Nextflow also supports a variety of container runtimes, e.g. Docker. To specify a different Docker image for each process: @@ -77,9 +78,6 @@ profiles{ withName:foo { container = 'image_name_1' } - withName:bar { - container = 'image_name_2' - } } } } @@ -87,10 +85,9 @@ profiles{ ``` -### Executor and resource specifications: +### Executor and resource specifications - The pipeline can also be run on an HPC. You can set the executor by running the pipeline with `-profile cluster`. -- Feel free to add other configurations, e.g. the number of CPUS, the memory allocation, etc. If you are new to Nextflow framework, please visit the [Nextlfow page](https://www.nextflow.io/docs/latest/config.html#). ``` cluster { @@ -123,7 +120,8 @@ nextflow run main.nf -profile standard,conda ### **Running on multiple samples** -The pipeline is able to run on multiple samples. In this scenario, the shared parameters for input data are retrieved from a sample sheet using `params.multi_sample`, which is set to None by default. +The pipeline is able to run on multiple samples. +In this scenario, the shared parameters for input data are retrieved from a sample sheet using `params.multi_sample`, which is set to None by default. Along with the input data, the sample sheet should contain an additional column for unique sample IDs assigned to each sample. The remaining parameters for each process are specified in the nextflow.config file, just like when demultiplexing a single sample. However, there is a distinction between running on a single sample and running on multiple samples. When processing multiple samples, the pipeline only permits a single value for each process parameter, whereas in the case of a single sample, multiple values separated by commas are allowed. The sample sheet (example file see the Resources section below) should have e.g. 
following columns depending on the methods you want to run: From bd76007c3725323a030766ee1b8055f262d8cb69 Mon Sep 17 00:00:00 2001 From: zethson Date: Wed, 7 Feb 2024 12:04:24 +0100 Subject: [PATCH 12/16] Fix violin Signed-off-by: zethson --- bin/demuxem.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/demuxem.py b/bin/demuxem.py index 28ab22b..5096a7f 100755 --- a/bin/demuxem.py +++ b/bin/demuxem.py @@ -156,7 +156,7 @@ rna_data.X.data = np.log1p(rna_data.X.data) for gene_name in args.generateGenderPlot: - io.violin( + pg.violin( rna_data, gene_name, "{output_name}.{gene_name}.violin.pdf".format( From 686bf1cb8e5e93ad6f5f2755ce15771aa964a176 Mon Sep 17 00:00:00 2001 From: Xichen Wu <102925032+wxicu@users.noreply.github.com> Date: Thu, 8 Feb 2024 01:56:06 +0100 Subject: [PATCH 13/16] update summary component of hashing workflow replace the str.contains part --- modules/multi/hash_demultiplexing.nf | 38 +++++++++++++++------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/modules/multi/hash_demultiplexing.nf b/modules/multi/hash_demultiplexing.nf index 4f204e0..13903bd 100644 --- a/modules/multi/hash_demultiplexing.nf +++ b/modules/multi/hash_demultiplexing.nf @@ -17,14 +17,7 @@ process summary{ conda "pandas scanpy mudata" input: - tuple val(sampleId), path(hto_matrix, stageAs: 'hto_data'), path(rna_matrix, stageAs: 'rna_data') - val demuxem_result - val hashsolo_result - val htodemux_result - val multiseq_result - val hashedDrops_result - val bff_result - val gmmDemux_result + tuple(val(sampleId), path(hto_matrix, stageAs: 'hto_data'), path(rna_matrix, stageAs: 'rna_data'), val(hashedDrops_result), val(demuxem_result), val(hashsolo_result), val(multiseq_result), val(htodemux_result), val(gmmDemux_result), val(bff_result)) val generate_anndata val generate_mudata @@ -43,31 +36,24 @@ process summary{ def generate_mdata = "" if (demuxem_result != "no_result"){ - demuxem_res = demuxem_result.find{it.name.contains(sampleId)} demuxem_files = "--demuxem ${demuxem_res}" } if (hashsolo_result != "no_result"){ - hashsolo_res = hashsolo_result.find{it.name.contains(sampleId)} hashsolo_files = "--hashsolo ${hashsolo_res}" } if (htodemux_result != "no_result"){ - htodemux_res = htodemux_result.find{it.name.contains(sampleId)} htodemux_files = "--htodemux ${htodemux_res}" } if (multiseq_result != "no_result"){ - multiseq_res = multiseq_result.find{it.name.contains(sampleId)} multiseq_files = "--multiseq ${multiseq_res}" } if (hashedDrops_result != "no_result"){ - hashedDrops_res = hashedDrops_result.find{it.name.contains(sampleId)} hashedDrops_files = "--hashedDrops ${hashedDrops_res}" } if (gmmDemux_result != "no_result"){ - gmmDemux_res = gmmDemux_result.find{it.name.contains(sampleId)} gmmDemux_files = "--gmm_demux ${gmmDemux_res}" } if (bff_result != "no_result"){ - bff_res = bff_result.find{it.name.contains(sampleId)} bff_files = "--bff ${bff_res}" } if (generate_anndata == "True"){ @@ -87,7 +73,7 @@ process summary{ } """ - summary_hash.py $demuxem_files $htodemux_files $multiseq_files $hashedDrops_files $hashsolo_files $gmmDemux_files $bff_files $generate_adata $generate_mdata --sampleId $sampleId + summary_hash.py $demuxem_files $htodemux_files $multiseq_files $hashedDrops_files $hashsolo_files $gmmDemux_files $bff_files $generate_adata $generate_mdata """ } @@ -189,9 +175,25 @@ workflow hash_demultiplexing{ gmmDemux_out = channel.value("no_result") } - input_channel.splitCsv(header:true).map { row-> tuple(row.sampleId, file(row.hto_matrix_filtered), 
file(row.rna_matrix_filtered))}.set {input_list_summary} + ////////// + //Summary + ////////// + + input_list_summary = input_channel.splitCsv(header:true).map { row-> tuple(row.sampleId, file(row.hto_matrix_filtered), file(row.rna_matrix_filtered))} - summary(input_list_summary, demuxem_out, hashsolo_out, htodemux_out, multiseq_out, hashedDrops_out,bff_out,gmmDemux_out, params.generate_anndata, params.generate_mudata) + htodemux_out_ch = htodemux_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*htodemux_",""), r1 )} + multiseq_out_ch = multiseq_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*multiseq_",""), r1 )} + hashsolo_out_ch = hashsolo_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*hashsolo_",""), r1 )} + demuxem_out_ch = demuxem_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*demuxem_",""), r1 )} + hashedDrops_out_ch = hashedDrops_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*hashedDrops_",""), r1 )} + bff_out_ch = bff_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*bff_",""), r1 )} + gmmDemux_out_ch = gmmDemuxout.flatten().map{r1-> tuple( "$r1".replaceAll(".*gmmDemux",""), r1 )} + + summary_input = input_list_summary.join(hashedDrops_out_ch,by:0,remainder: true).join(demuxem_out_ch,by:0,remainder: true).join(hashsolo_out_ch,by:0,remainder: true).join(multiseq_out_ch,by:0,remainder: true).join(htodemux_out_ch,by:0,remainder: true).join(gmmDemux_out_ch,by:0,remainder: true).join(bff_out_ch,by:0,remainder: true) + summary_input = summary_input.filter{ it[0] != 'no_result' } + + summary(summary_input, + params.generate_anndata, params.generate_mudata) emit: summary.out From 7399229579973dfd9a6a4f871d7df30f586ced76 Mon Sep 17 00:00:00 2001 From: Xichen Wu <102925032+wxicu@users.noreply.github.com> Date: Thu, 8 Feb 2024 01:56:56 +0100 Subject: [PATCH 14/16] remove unused sampleid param --- bin/summary_hash.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/bin/summary_hash.py b/bin/summary_hash.py index 590e016..d6678f5 100755 --- a/bin/summary_hash.py +++ b/bin/summary_hash.py @@ -38,10 +38,6 @@ help="10x-Genomics-formatted mtx directory for HTO expression", default=None, ) -parser.add_argument( - "--sampleId", help="sampleID if multiple samples are demultiplexed", default=None -) - args = parser.parse_args() From 7954340dfea9d9e9737adf4ec55deb7cc8dd5e14 Mon Sep 17 00:00:00 2001 From: Xichen Wu <102925032+wxicu@users.noreply.github.com> Date: Thu, 8 Feb 2024 02:04:08 +0100 Subject: [PATCH 15/16] fix typo fix typo of gmmdemuxout --- modules/multi/hash_demultiplexing.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/multi/hash_demultiplexing.nf b/modules/multi/hash_demultiplexing.nf index 13903bd..48397e3 100644 --- a/modules/multi/hash_demultiplexing.nf +++ b/modules/multi/hash_demultiplexing.nf @@ -187,7 +187,7 @@ workflow hash_demultiplexing{ demuxem_out_ch = demuxem_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*demuxem_",""), r1 )} hashedDrops_out_ch = hashedDrops_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*hashedDrops_",""), r1 )} bff_out_ch = bff_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*bff_",""), r1 )} - gmmDemux_out_ch = gmmDemuxout.flatten().map{r1-> tuple( "$r1".replaceAll(".*gmmDemux",""), r1 )} + gmmDemux_out_ch = gmmDemux_out.flatten().map{r1-> tuple( "$r1".replaceAll(".*gmmDemux",""), r1 )} summary_input = input_list_summary.join(hashedDrops_out_ch,by:0,remainder: true).join(demuxem_out_ch,by:0,remainder: true).join(hashsolo_out_ch,by:0,remainder: true).join(multiseq_out_ch,by:0,remainder: 
true).join(htodemux_out_ch,by:0,remainder: true).join(gmmDemux_out_ch,by:0,remainder: true).join(bff_out_ch,by:0,remainder: true) summary_input = summary_input.filter{ it[0] != 'no_result' } From 919079c41fec4be933efaac32e0055a590d6f092 Mon Sep 17 00:00:00 2001 From: Xichen Wu <102925032+wxicu@users.noreply.github.com> Date: Thu, 8 Feb 2024 09:11:25 +0100 Subject: [PATCH 16/16] add bcftools in vireo conda env add bcftools --- modules/multi/gene_demulti/vireo.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/multi/gene_demulti/vireo.nf b/modules/multi/gene_demulti/vireo.nf index d840ff1..92e6f7b 100755 --- a/modules/multi/gene_demulti/vireo.nf +++ b/modules/multi/gene_demulti/vireo.nf @@ -5,7 +5,7 @@ process vireo{ publishDir "$params.outdir/$sampleId/$params.mode/gene_demulti/vireo", mode: 'copy' label 'big_mem' tag "${sampleId}" - conda "aksarkar::vireosnp" + conda "aksarkar::vireosnp bioconda::bcftools" input: tuple val(sampleId), path(celldata), val(ndonor), val(donorfile)
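The sampleId-keyed outer joins introduced in PATCH 13 (and corrected in PATCH 15) above are the heart of the summary refactor: each tool's output directory is mapped to a `(sampleId, result)` tuple, and the per-sample input is outer-joined against every tool channel, so a sample missing one tool's output is kept with a `null` placeholder instead of being silently dropped. A minimal, self-contained Nextflow sketch of that pattern follows; the `input_ch`/`tool_ch` names and paths are illustrative placeholders, not the pipeline's real channels:

```
// Sketch of the keyed outer-join pattern (hypothetical channel names).
workflow {
    // per-sample input tuples, keyed by sampleId at position 0
    input_ch = Channel.of(['s1', 'hto1', 'rna1'], ['s2', 'hto2', 'rna2'])

    // a tool that produced output only for s1; the key is recovered from
    // the output directory name, as the real workflow does with replaceAll
    tool_ch = Channel.of('results/tool_s1')
        .map { r -> tuple(r.replaceAll(/.*tool_/, ''), r) }

    // remainder: true keeps s2 with a null placeholder instead of dropping it
    input_ch.join(tool_ch, by: 0, remainder: true).view()
    // -> [s1, hto1, rna1, results/tool_s1] and [s2, hto2, rna2, null]
}
```

Downstream, a filter such as `summary_input.filter{ it[0] != 'no_result' }` (as in PATCH 13) then discards the placeholder key produced when a tool channel carries only the `no_result` sentinel.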