From f2503811f5574ab3795cf35c9f0058676145bcf7 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Tue, 20 Jun 2023 10:37:18 +0100 Subject: [PATCH 01/48] Updated Dockerfile --- modules/local/ampliconsuite/Dockerfile | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/modules/local/ampliconsuite/Dockerfile b/modules/local/ampliconsuite/Dockerfile index 90e3e657..d502e6cd 100644 --- a/modules/local/ampliconsuite/Dockerfile +++ b/modules/local/ampliconsuite/Dockerfile @@ -1,12 +1,4 @@ -FROM python:3.10 +FROM continuumio/miniconda3 # Install Python packages -RUN pip install --no-cache-dir \ - pysam==0.21.0 \ - flask==2.3.2 \ - numpy==1.24.3 \ - matplotlib==3.7.1 \ - scipy==1.10.0 \ - intervaltree==3.1.0 \ - future==0.18.3 \ - Mosek==9.3.22 +RUN conda install -c bioconda -c mosek ampliconsuite=0.1555.2 mosek=10.1b1 From 2204d2e6ed14b2b87c49376532f4faf2acd2dc22 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Tue, 20 Jun 2023 10:42:24 +0100 Subject: [PATCH 02/48] Removed versions --- modules/local/ampliconsuite/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/ampliconsuite/Dockerfile b/modules/local/ampliconsuite/Dockerfile index d502e6cd..47419888 100644 --- a/modules/local/ampliconsuite/Dockerfile +++ b/modules/local/ampliconsuite/Dockerfile @@ -1,4 +1,4 @@ FROM continuumio/miniconda3 # Install Python packages -RUN conda install -c bioconda -c mosek ampliconsuite=0.1555.2 mosek=10.1b1 +RUN conda install -c bioconda -c mosek ampliconsuite mosek From 616e15719bb502b191057d53e1f314ef73b5e624 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Tue, 20 Jun 2023 13:50:04 +0100 Subject: [PATCH 03/48] Updates - Unsure what they were --- conf/modules.config | 26 +++------ .../ampliconclassifier/ampliconclassifier.nf | 1 + .../ampliconclassifier/featuresimilarity.nf | 57 +++++++++++++++++++ modules/local/cnvkit/segment.nf | 2 +- modules/local/samplesheet_check.nf | 2 +- nextflow.config | 4 ++ workflows/circdna.nf | 24 +++++--- 7 files changed, 86 insertions(+), 30 deletions(-) create mode 100644 modules/local/ampliconclassifier/featuresimilarity.nf diff --git a/conf/modules.config b/conf/modules.config index d60c33bd..0c0b57fc 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -330,6 +330,8 @@ process { ] } withName: 'PREPAREAA' { + wave.enabled = true + wave.strategy = ['conda'] ext.args = "" publishDir = [ path: { "${params.outdir}/prepareaa" }, @@ -402,27 +404,13 @@ process { enabled: true ] } - withName: 'AMPLICONCLASSIFIER_AMPLICONSIMILARITY' { + withName: 'AMPLICONCLASSIFIER_FEATURESIMILARITY' { ext.args = "" publishDir = [ - [ - path: { "${params.outdir}/ampliconclassifier/ampliconclassifier/input" }, - mode: params.publish_dir_mode, - pattern: '*.input', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ], - [ - path: { "${params.outdir}/ampliconclassifier/ampliconsimilarity/log" }, - mode: params.publish_dir_mode, - pattern: '*.log', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - ], - [ - path: { "${params.outdir}/ampliconclassifier/ampliconsimilarity/similarity" }, - mode: params.publish_dir_mode, - pattern: '*_similarity_scores.tsv', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - ], + path: { "${params.outdir}/ampliconclassifier/featuresimilarity" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, + enabled: true ] } withName: 'AMPLICONCLASSIFIER_MAKEINPUT' { diff --git a/modules/local/ampliconclassifier/ampliconclassifier.nf b/modules/local/ampliconclassifier/ampliconclassifier.nf index 299f7168..5b77eb83 100644 --- a/modules/local/ampliconclassifier/ampliconclassifier.nf +++ b/modules/local/ampliconclassifier/ampliconclassifier.nf @@ -19,6 +19,7 @@ process AMPLICONCLASSIFIER_AMPLICONCLASSIFIER { path ("*annotated_cycles.txt" ) , emit: annotated_cycles, optional: true path ("*class_radar.{png,pdf}" ) , emit: radar_plot , optional: true path ("*feature_entropy.tsv" ) , emit: entropy , optional: true + path ("*features_to_graph.txt" ) , emit: features_to_graph, optional: true path ("*feature_basic_properties.tsv" ) , emit: basic_properties, optional: true path ("*classification_bed_files/*" ) , emit: bed_files , optional: true path ("*annotated_cycles_files/" ) , emit: cycles_files , optional: true diff --git a/modules/local/ampliconclassifier/featuresimilarity.nf b/modules/local/ampliconclassifier/featuresimilarity.nf new file mode 100644 index 00000000..805f914a --- /dev/null +++ b/modules/local/ampliconclassifier/featuresimilarity.nf @@ -0,0 +1,57 @@ +process AMPLICONCLASSIFIER_FEATURESIMILARITY { + tag "AA Amplicons" + label 'process_low' + + conda "bioconda::ampliconclassifier=0.4.14" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ampliconclassifier:0.4.14--hdfd78af_0': + 'quay.io/biocontainers/ampliconclassifier:0.4.14--hdfd78af_0' }" + + input: + path(input) + + output: + path("*_scores.tsv") , emit: scores + path("*") + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + REF=${params.reference_build} + export AA_DATA_REPO=${params.aa_data_repo} + export AA_SRC=${projectDir}/bin + export AC_SRC=\$(dirname \$(which feature_similarity.py)) + + feature_similarity.py \\ + --ref \$REF \\ + $args \\ + -f $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + """ + REF=${params.reference_build} + export AA_DATA_REPO=${params.aa_data_repo} + export AA_SRC=${projectDir}/bin + export AC_SRC=\$(dirname \$(which feature_similarity.py)) +) + + feature_similarity.py --help + touch "ampliconclassifier_similarity_scores.tsv" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) + END_VERSIONS + """ +} diff --git a/modules/local/cnvkit/segment.nf b/modules/local/cnvkit/segment.nf index b27c39b7..2ac1f16f 100644 --- a/modules/local/cnvkit/segment.nf +++ b/modules/local/cnvkit/segment.nf @@ -2,7 +2,7 @@ process CNVKIT_SEGMENT { tag "$meta.id" label 'process_low' - conda (params.enable_conda ? 'bioconda::cnvkit=0.9.9' : null) + conda 'bioconda::cnvkit=0.9.9' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/cnvkit:0.9.9--pyhdfd78af_0' : 'quay.io/biocontainers/cnvkit:0.9.9--pyhdfd78af_0' }" diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index 755fb044..c8dd44b2 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -2,7 +2,7 @@ process SAMPLESHEET_CHECK { tag "$samplesheet" label 'process_low' - conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) + conda "conda-forge::python=3.8.3" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/python:3.8.3' : 'biocontainers/python:3.8.3' }" diff --git a/nextflow.config b/nextflow.config index 62206027..f4d13262 100644 --- a/nextflow.config +++ b/nextflow.config @@ -253,6 +253,10 @@ dag { enabled = true file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.html" } +// wave { +// enabled = true +// strategy = ['container', 'container'] +// } manifest { name = 'nf-core/circdna' diff --git a/workflows/circdna.nf b/workflows/circdna.nf index d8e73704..731fb645 100644 --- a/workflows/circdna.nf +++ b/workflows/circdna.nf @@ -156,6 +156,7 @@ include { AMPLIFIED_INTERVALS } from '../modules/local include { AMPLICONARCHITECT_AMPLICONARCHITECT } from '../modules/local/ampliconarchitect/ampliconarchitect.nf' include { AMPLICONCLASSIFIER_AMPLICONCLASSIFIER } from '../modules/local/ampliconclassifier/ampliconclassifier.nf' include { AMPLICONCLASSIFIER_AMPLICONSIMILARITY } from '../modules/local/ampliconclassifier/ampliconsimilarity.nf' +include { AMPLICONCLASSIFIER_FEATURESIMILARITY } from '../modules/local/ampliconclassifier/featuresimilarity.nf' include { AMPLICONCLASSIFIER_MAKEINPUT } from '../modules/local/ampliconclassifier/makeinput.nf' include { AMPLICONCLASSIFIER_MAKERESULTSTABLE } from '../modules/local/ampliconclassifier/makeresultstable.nf' @@ -412,10 +413,11 @@ workflow CIRCDNA { ) ch_versions = ch_versions.mix(CNVKIT_SEGMENT.out.versions) - // PREPAREAA ( - // ch_bam_sorted.join(CNVKIT_SEGMENT.out.cns) - // ) - // ch_versions = ch_versions.mix(PREPAREAA.out.versions) + PREPAREAA ( + ch_bam_sorted.join(CNVKIT_SEGMENT.out.cns) + ) + ch_versions = ch_versions.mix(PREPAREAA.out.versions) + COLLECT_SEEDS ( CNVKIT_SEGMENT.out.cns ) @@ -448,15 +450,19 @@ workflow CIRCDNA { ch_aa_cycles.flatten().collect().ifEmpty([]) ) + ac_input_ch = AMPLICONCLASSIFIER_MAKEINPUT.out.input + AMPLICONCLASSIFIER_AMPLICONCLASSIFIER ( - AMPLICONCLASSIFIER_MAKEINPUT.out.input + ac_input_ch ) - ac_input_ch = AMPLICONCLASSIFIER_MAKEINPUT.out.input ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.versions) - AMPLICONCLASSIFIER_AMPLICONSIMILARITY ( - ac_input_ch + + + similarity_input = AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.features_to_graph + AMPLICONCLASSIFIER_FEATURESIMILARITY ( + similarity_input ) - ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_AMPLICONSIMILARITY.out.versions) + ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_FEATURESIMILARITY.out.versions) ac_input_ch. map {file -> ["group", file]}. 
From ef61377a704ad4f949c2f84f63af48f622e69051 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Fri, 23 Jun 2023 14:57:57 +0100 Subject: [PATCH 04/48] doesnt work --- modules/local/ampliconsuite/Dockerfile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/modules/local/ampliconsuite/Dockerfile b/modules/local/ampliconsuite/Dockerfile index 47419888..e427e9a2 100644 --- a/modules/local/ampliconsuite/Dockerfile +++ b/modules/local/ampliconsuite/Dockerfile @@ -1,4 +1,5 @@ -FROM continuumio/miniconda3 +FROM continuumio/miniconda3:4.11.0 -# Install Python packages -RUN conda install -c bioconda -c mosek ampliconsuite mosek +RUN conda install -c conda-forge mamba +RUN conda install -c conda-forge mscorefonts +RUN mamba install -c bioconda ampliconsuite From fc81a306122a3fed02fffcb6ec4e7aa3065ddc03 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Mon, 26 Jun 2023 11:00:42 +0100 Subject: [PATCH 05/48] Fixed collect issue -> Immediate release in new pull request --- workflows/circdna.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/circdna.nf b/workflows/circdna.nf index 731fb645..ef7d14f1 100644 --- a/workflows/circdna.nf +++ b/workflows/circdna.nf @@ -23,7 +23,7 @@ if (!(params.input_format == "FASTQ" | params.input_format == "BAM")) { } // Modify fasta channel to include meta data -ch_fasta_meta = ch_fasta.map{ it -> [[id:it[0].baseName], it] } +ch_fasta_meta = ch_fasta.map{ it -> [[id:it[0].baseName], it] }.collect() branch = params.circle_identifier.split(",") run_circexplorer2 = ("circexplorer2" in branch) From 1c9f7984f731656decb5a1cec0a575acbf7f0166 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Mon, 26 Jun 2023 11:01:13 +0100 Subject: [PATCH 06/48] test --- modules/local/ampliconsuite/Dockerfile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modules/local/ampliconsuite/Dockerfile b/modules/local/ampliconsuite/Dockerfile index e427e9a2..6ad267bb 100644 --- a/modules/local/ampliconsuite/Dockerfile +++ b/modules/local/ampliconsuite/Dockerfile @@ -1,5 +1,10 @@ FROM continuumio/miniconda3:4.11.0 -RUN conda install -c conda-forge mamba +RUN conda install -c anaconda wget + +# Download AmpliconSuite Conda Recipe Files +RUN https://raw.githubusercontent.com/AmpliconSuite/AmpliconSuite-pipeline/master/conda-recipe/meta.yaml +RUN https://raw.githubusercontent.com/AmpliconSuite/AmpliconSuite-pipeline/master/conda-recipe/build.sh + RUN conda install -c conda-forge mscorefonts RUN mamba install -c bioconda ampliconsuite From 32c479843dfb62255066bdf7fc317a1657293c26 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Tue, 27 Jun 2023 09:09:26 +0100 Subject: [PATCH 07/48] Update, not working yet --- modules/local/ampliconsuite/Dockerfile | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/modules/local/ampliconsuite/Dockerfile b/modules/local/ampliconsuite/Dockerfile index 6ad267bb..9586f75d 100644 --- a/modules/local/ampliconsuite/Dockerfile +++ b/modules/local/ampliconsuite/Dockerfile @@ -1,10 +1,20 @@ FROM continuumio/miniconda3:4.11.0 +RUN conda config --add channels defaults +RUN conda config --add channels bioconda +RUN conda config --add channels conda-forge +RUN conda config --set channel_priority strict + RUN conda install -c anaconda wget +RUN conda install -c anaconda python=3.8 +RUN conda install -c anaconda conda-build # Download AmpliconSuite Conda Recipe Files -RUN https://raw.githubusercontent.com/AmpliconSuite/AmpliconSuite-pipeline/master/conda-recipe/meta.yaml -RUN 
https://raw.githubusercontent.com/AmpliconSuite/AmpliconSuite-pipeline/master/conda-recipe/build.sh +RUN wget https://raw.githubusercontent.com/AmpliconSuite/AmpliconSuite-pipeline/master/conda-recipe/meta.yaml +RUN wget https://raw.githubusercontent.com/AmpliconSuite/AmpliconSuite-pipeline/master/conda-recipe/build.sh -RUN conda install -c conda-forge mscorefonts -RUN mamba install -c bioconda ampliconsuite +RUN mkdir conda-recipe +RUN mv meta.yaml build.sh conda-recipe/ +RUN conda build --python 3.8 --numpy 1.22 conda-recipe/ +RUN conda install -c local ampliconsuite +RUN conda install -c mosek mosek From 8ab10aee2d6f2c84035fa9788ae1a90387966474 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Wed, 28 Jun 2023 16:34:55 +0100 Subject: [PATCH 08/48] updated to ampliconsuite-pipeline. Much work needed to finalise. Works until make_results_table --- conf/modules.config | 9 +- .../ampliconarchitect/ampliconarchitect.nf | 30 ++++--- .../ampliconclassifier/ampliconclassifier.nf | 24 ++--- modules/local/ampliconsuite/prepareaa.nf | 16 ++-- workflows/circdna.nf | 87 ++++++++++--------- 5 files changed, 94 insertions(+), 72 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 0c0b57fc..ca981dd0 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -330,8 +330,7 @@ process { ] } withName: 'PREPAREAA' { - wave.enabled = true - wave.strategy = ['conda'] + docker.registry = null ext.args = "" publishDir = [ path: { "${params.outdir}/prepareaa" }, @@ -392,6 +391,12 @@ process { pattern: '*summary.txt', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], + [ + path: { "${params.outdir}/ampliconarchitect/ampliconarchitect/all" }, + mode: params.publish_dir_mode, + pattern: '*', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ], ] } diff --git a/modules/local/ampliconarchitect/ampliconarchitect.nf b/modules/local/ampliconarchitect/ampliconarchitect.nf index 6a96c30b..c03ca2c3 100644 --- a/modules/local/ampliconarchitect/ampliconarchitect.nf +++ b/modules/local/ampliconarchitect/ampliconarchitect.nf @@ -2,10 +2,12 @@ process AMPLICONARCHITECT_AMPLICONARCHITECT { tag "$meta.id" label 'process_low' - conda "conda-forge::python=2.7 bioconda::pysam=0.15.2 anaconda::flask=1.1.2 anaconda::cython=0.29.14 anaconda::numpy=1.16.6 anaconda::scipy=1.2.1 conda-forge::matplotlib=2.2.5 mosek::mosek=8.0.60 anaconda::future=0.18.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-6eefa51f13933d65b4f8155ca2f8cd81dea474ba:baa777f7c4e89a2ec4d1eab7d424a1f46503bac7-0': - 'quay.io/biocontainers/mulled-v2-6eefa51f13933d65b4f8155ca2f8cd81dea474ba:baa777f7c4e89a2ec4d1eab7d424a1f46503bac7-0' }" + // conda "conda-forge::python=2.7 bioconda::pysam=0.15.2 anaconda::flask=1.1.2 anaconda::cython=0.29.14 anaconda::numpy=1.16.6 anaconda::scipy=1.2.1 conda-forge::matplotlib=2.2.5 mosek::mosek=8.0.60 anaconda::future=0.18.2" + // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ // 'https://depot.galaxyproject.org/singularity/mulled-v2-6eefa51f13933d65b4f8155ca2f8cd81dea474ba:baa777f7c4e89a2ec4d1eab7d424a1f46503bac7-0': + // 'quay.io/biocontainers/mulled-v2-6eefa51f13933d65b4f8155ca2f8cd81dea474ba:baa777f7c4e89a2ec4d1eab7d424a1f46503bac7-0' }" + conda "bioconda::ampliconsuite=0.1555.2 mosek::mosek=10.1b1" + container '/home/local/BICR/dschreye/ampliconsuite.sif' input: tuple val(meta), path(bam), path(bai), path(bed) @@ -14,11 +16,11 @@ process AMPLICONARCHITECT_AMPLICONARCHITECT { path "versions.yml" , emit: versions tuple val(meta), path("*cycles.txt") , optional: true, emit: cycles tuple val(meta), path("*graph.txt") , optional: true, emit: graph + tuple val(meta), path("*cnseg.txt") , optional: true, emit: cnseg tuple val(meta), path("*.out") , optional: true, emit: out - tuple val(meta), path("*_cnseg.txt") , optional: true, emit: cnseg - tuple val(meta), path("*.pdf") , optional: true, emit: pdf - tuple val(meta), path("*.png") , optional: true, emit: png + tuple val(meta), path("*.{pdf,png}") , optional: true, emit: svview tuple val(meta), path("*_summary.txt") , optional: true, emit: summary + tuple val(meta), path("*") , optional: true, emit: all script: def args = task.ext.args ?: '' @@ -26,11 +28,19 @@ process AMPLICONARCHITECT_AMPLICONARCHITECT { """ export AA_DATA_REPO=${params.aa_data_repo} export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - export AA_SRC=${projectDir}/bin + # export AA_SRC=${projectDir}/bin + export AA_SRC=\$(dirname \$(readlink -f \$(which AmpliconArchitect.py))) + export AC_SRC=\$(dirname \$(readlink -f \$(which amplicon_classifier.py))) REF=${params.reference_build} - AmpliconArchitect.py $args \\ - --bam $bam --bed $bed --ref \$REF --out "${prefix}" + AmpliconSuite-pipeline.py \\ + -t $task.cpus \\ + --bam $bam \\ + --bed $bed \\ + --ref \$REF \\ + -s "${prefix}" \\ + --run_AA + $args cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/ampliconclassifier/ampliconclassifier.nf b/modules/local/ampliconclassifier/ampliconclassifier.nf index 5b77eb83..9bc040ab 100644 --- a/modules/local/ampliconclassifier/ampliconclassifier.nf +++ b/modules/local/ampliconclassifier/ampliconclassifier.nf @@ -2,13 +2,13 @@ process AMPLICONCLASSIFIER_AMPLICONCLASSIFIER { tag "AA Amplicons" label 'process_low' - conda "bioconda::ampliconclassifier=0.4.14" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/ampliconclassifier:0.4.14--hdfd78af_0': - 'quay.io/biocontainers/ampliconclassifier:0.4.14--hdfd78af_0' }" + conda "conda-forge::python=3.7 bioconda::pysam=0.16.0 anaconda::flask=2.2.2 conda-forge::numpy=1.21.6 conda-forge::matplotlib=3.2.2 anaconda::scipy=1.7.3 conda-forge::intervaltree=3.0.2 anaconda::future=0.18.2 mosek::mosek=9.0.88" + container '/home/local/BICR/dschreye/ampliconsuite.sif' input: - path (input_file) + path (graphs) + path (cycles) + path (cnseg) output: path ("*amplicon_classification_profiles.tsv" ), emit: class_tsv , optional: true @@ -29,17 +29,19 @@ process AMPLICONCLASSIFIER_AMPLICONCLASSIFIER { script: def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "ampliconarchitect" """ REF=${params.reference_build} export AA_DATA_REPO=${params.aa_data_repo} - export AA_SRC=${projectDir}/bin + export AA_SRC=\$(dirname \$(readlink -f \$(which AmpliconArchitect.py))) + export AC_SRC=\$(dirname \$(readlink -f \$(which amplicon_classifier.py))) - amplicon_classifier.py \\ - --ref \$REF \\ - $args \\ - --input $input_file \\ - > ampliconclassifier.classifier_stdout.log + AmpliconSuite-pipeline.py \\ + -s $prefix \\ + --completed_AA_runs ./ \\ + -t $task.cpus \\ + --ref "GRCh38" cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/ampliconsuite/prepareaa.nf b/modules/local/ampliconsuite/prepareaa.nf index 8484ede1..63f7be24 100644 --- a/modules/local/ampliconsuite/prepareaa.nf +++ b/modules/local/ampliconsuite/prepareaa.nf @@ -2,7 +2,8 @@ process PREPAREAA { tag "$meta.id" label 'process_low' - conda "conda-forge::python=3.7 bioconda::pysam=0.16.0 anaconda::flask=2.2.2 conda-forge::numpy=1.21.6 conda-forge::matplotlib=3.2.2 anaconda::scipy=1.7.3 conda-forge::intervaltree=3.0.2 anaconda::future=0.18.2 mosek::mosek=9.0.88" + conda "bioconda::ampliconsuite=0.1555.2 mosek::mosek=10.1b1" + container '/home/local/BICR/dschreye/src/AmpliconSuite-pipeline/docker/test/ampliconsuite.img' input: tuple val(meta), path(bam), path(cns) @@ -26,21 +27,22 @@ process PREPAREAA { """ export AA_DATA_REPO=${params.aa_data_repo} export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - export AA_SRC=${projectDir}/bin + export AA_SRC=\$(dirname \$(readlink -f \$(which AmpliconArchitect.py))) + export AC_SRC=\$(dirname \$(readlink -f \$(which amplicon_classifier.py))) REF=${params.reference_build} - PrepareAA.py \\ + AmpliconSuite-pipeline.py \\ $args \\ -s $prefix \\ -t $task.cpus \\ --cnv_bed $cns \\ - --sorted_bam $bam \\ - --cngain $cngain \\ - --ref $ref + --bam $bam \\ + --ref $ref \\ + $args cat <<-END_VERSIONS > versions.yml "${task.process}": - prepareaa: \$(echo \$(PrepareAA.py --version) | sed 's/^.*PrepareAA version //') + AmpliconSuite-pipeline.py: \$(echo \$(AmpliconSuite-pipeline.py --version) | sed 's/^.*PrepareAA version //') END_VERSIONS """ diff --git a/workflows/circdna.nf b/workflows/circdna.nf index ef7d14f1..cf209418 100644 --- a/workflows/circdna.nf +++ b/workflows/circdna.nf @@ -418,20 +418,20 @@ workflow CIRCDNA { ) ch_versions = ch_versions.mix(PREPAREAA.out.versions) - COLLECT_SEEDS ( - CNVKIT_SEGMENT.out.cns - ) - ch_versions = ch_versions.mix(COLLECT_SEEDS.out.versions) - - ch_aa_seeds = COLLECT_SEEDS.out.bed - AMPLIFIED_INTERVALS ( - ch_aa_seeds.join(ch_bam_sorted).join(ch_bam_sorted_bai) - ) - ch_versions = ch_versions.mix(AMPLIFIED_INTERVALS.out.versions) +// COLLECT_SEEDS ( +// CNVKIT_SEGMENT.out.cns +// ) +// ch_versions = ch_versions.mix(COLLECT_SEEDS.out.versions) +// +// 
ch_aa_seeds = COLLECT_SEEDS.out.bed +// AMPLIFIED_INTERVALS ( +// ch_aa_seeds.join(ch_bam_sorted).join(ch_bam_sorted_bai) +// ) +// ch_versions = ch_versions.mix(AMPLIFIED_INTERVALS.out.versions) AMPLICONARCHITECT_AMPLICONARCHITECT ( ch_bam_sorted.join(ch_bam_sorted_bai). - join(AMPLIFIED_INTERVALS.out.bed) + join(PREPAREAA.out.bed) ) // AMPLICONARCHITECT_AMPLICONARCHITECT ( @@ -444,44 +444,47 @@ workflow CIRCDNA { map {meta, path -> [path]} ch_aa_graphs = AMPLICONARCHITECT_AMPLICONARCHITECT.out.graph. map {meta, path -> [path]} - - AMPLICONCLASSIFIER_MAKEINPUT ( - ch_aa_graphs.flatten().collect().ifEmpty([]), - ch_aa_cycles.flatten().collect().ifEmpty([]) - ) - - ac_input_ch = AMPLICONCLASSIFIER_MAKEINPUT.out.input + ch_aa_cnseg = AMPLICONARCHITECT_AMPLICONARCHITECT.out.cnseg. + map {meta, path -> [path]} AMPLICONCLASSIFIER_AMPLICONCLASSIFIER ( - ac_input_ch + ch_aa_graphs.flatten().collect().ifEmpty([]), + ch_aa_cycles.flatten().collect().ifEmpty([]), + ch_aa_cnseg.flatten().collect().ifEmpty([]) ) - ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.versions) - - similarity_input = AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.features_to_graph - AMPLICONCLASSIFIER_FEATURESIMILARITY ( - similarity_input - ) - ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_FEATURESIMILARITY.out.versions) - - ac_input_ch. - map {file -> ["group", file]}. - set {ac_results_input_ch} - AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.class_tsv. - map {file -> ["group", file]}. - set {ac_class_ch} +// ac_input_ch = AMPLICONCLASSIFIER_MAKEINPUT.out.input +// +// AMPLICONCLASSIFIER_AMPLICONCLASSIFIER ( +// ac_input_ch +// ) +// ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.versions) +// +// +// similarity_input = AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.features_to_graph +// AMPLICONCLASSIFIER_FEATURESIMILARITY ( +// similarity_input +// ) +// ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_FEATURESIMILARITY.out.versions) +// +// ac_input_ch. +// map {file -> ["group", file]}. +// set {ac_results_input_ch} +// AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.class_tsv. +// map {file -> ["group", file]}. +// set {ac_class_ch} // ac_results_input_ch.join(ac_class_ch). 
// map{group, input_file, class_file -> [input_file, class_file]} - AMPLICONCLASSIFIER_MAKERESULTSTABLE ( - ac_input_ch, - AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.class_tsv, - AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.gene_list, - AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.entropy, - AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.basic_properties, - AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.bed_files - ) - ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_MAKERESULTSTABLE.out.versions) +// AMPLICONCLASSIFIER_MAKERESULTSTABLE ( +// ac_input_ch, +// AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.class_tsv, +// AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.gene_list, +// AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.entropy, +// AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.basic_properties, +// AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.bed_files +// ) +// ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_MAKERESULTSTABLE.out.versions) } From 7c28c3c26dce767ff4ae3d8cae9a17a7a39b7dca Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Mon, 3 Jul 2023 15:50:32 +0100 Subject: [PATCH 09/48] Updated modules --- conf/modules.config | 85 +++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 61 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index ca981dd0..48a73801 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -310,51 +310,27 @@ process { // AmpliconArchitect Options // process { - withName: 'CNVKIT_BATCH' { - ext.args = "--method wgs" - publishDir = [ - path: { "${params.outdir}/cnvkit" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } - - withName: 'CNVKIT_SEGMENT' { - ext.args = "" - publishDir = [ - path: { "${params.outdir}/cnvkit" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } withName: 'PREPAREAA' { - docker.registry = null - ext.args = "" - publishDir = [ - path: { "${params.outdir}/prepareaa" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } - withName: 'COLLECT_SEEDS' { - ext.args = "" - publishDir = [ - path: { "${params.outdir}/ampliconarchitect/cnvkit" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } - withName: 'AMPLIFIED_INTERVALS' { ext.args = "" publishDir = [ - path: { "${params.outdir}/ampliconarchitect/cnvkit" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true + [ + path: { "${params.outdir}/prepareaa" }, + mode: params.publish_dir_mode, + pattern: '*{CNV_SEEDS.bed,filtered.bed}', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ], + [ + path: { "${params.outdir}/prepareaa/cnvkit" }, + mode: params.publish_dir_mode, + pattern: '*{call.cns,cnr.gz,md.cns,CALLS.bed}', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ], + [ + path: { "${params.outdir}/prepareaa/logs" }, + mode: params.publish_dir_mode, + pattern: '*{.log,.json,log.txt}', + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] ] } withName: 'AMPLICONARCHITECT_AMPLICONARCHITECT' { @@ -362,48 +338,35 @@ process { ext.args = "" publishDir = [ [ - path: { "${params.outdir}/ampliconarchitect/ampliconarchitect/sv_view" }, + path: { "${params.outdir}/ampliconarchitect/sv_view" }, mode: params.publish_dir_mode, pattern: '*.{png,pdf}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], [ - path: { "${params.outdir}/ampliconarchitect/ampliconarchitect/amplicons" }, + path: { "${params.outdir}/ampliconarchitect/amplicons" }, mode: params.publish_dir_mode, pattern: '*{graph.txt,cycles.txt}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], [ - path: { "${params.outdir}/ampliconarchitect/ampliconarchitect/logs" }, + path: { "${params.outdir}/ampliconarchitect/logs" }, mode: params.publish_dir_mode, - pattern: '*logs.txt', + pattern: '*{log.txt,summary.txt,sample_metadata.json,run_metadata.json,finish_flag.txt}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], [ - path: { "${params.outdir}/ampliconarchitect/ampliconarchitect/cnseg" }, + path: { "${params.outdir}/ampliconarchitect/cnseg" }, mode: params.publish_dir_mode, pattern: '*cnseg.txt', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], - [ - path: { "${params.outdir}/ampliconarchitect/ampliconarchitect/summary" }, - mode: params.publish_dir_mode, - pattern: '*summary.txt', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ], - [ - path: { "${params.outdir}/ampliconarchitect/ampliconarchitect/all" }, - mode: params.publish_dir_mode, - pattern: '*', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ], - ] } withName: 'AMPLICONCLASSIFIER_AMPLICONCLASSIFIER' { ext.args = "--report_complexity --verbose_classification --plotstyle 'individual'" publishDir = [ - path: { "${params.outdir}/ampliconclassifier/ampliconclassifier" }, + path: { "${params.outdir}/ampliconclassifier/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, enabled: true From 884c0dba7034760724980d8597171b31b38d490c Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Mon, 3 Jul 2023 15:50:49 +0100 Subject: [PATCH 10/48] Updated test configs --- conf/test.config | 1 - conf/test_AA.config | 3 --- 2 files changed, 4 deletions(-) diff --git a/conf/test.config b/conf/test.config index 40dfeffa..cdb0723f 100644 --- a/conf/test.config +++ b/conf/test.config @@ -23,7 +23,6 @@ params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/circdna/samplesheet/samplesheet.csv' input_format = 'FASTQ' - // Genome references fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/circdna/reference/genome.fa' circle_identifier = 'circexplorer2,circle_finder,circle_map_realign,circle_map_repeats,unicycler' diff --git a/conf/test_AA.config b/conf/test_AA.config index 0acbb061..de9aece5 100644 --- a/conf/test_AA.config +++ b/conf/test_AA.config @@ -24,9 +24,6 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/circdna/samplesheet/samplesheet.csv' - // Outdir - outdir = "./results" - // Genome references fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/circdna/reference/genome.fa' circle_identifier = 'ampliconarchitect' From cd3381c9604ed8a0ebd1434d09ba94eecebbb2b5 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Mon, 3 Jul 2023 16:10:22 +0100 Subject: [PATCH 11/48] Updated parameter requirements --- nextflow_schema.json | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 83f2899a..ff153988 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -61,6 +61,22 @@ } } }, + "circdna_identifier_options": { + "title": "Circular DNA identifier options", + "type": "object", + "fa_icon": "fas fa-circle-notch", + "description": "Options to adjust inital circular DNA identifier", + "required": ["circle_identifier"], + "properties": { + "circle_identifier": { + "type": "string", + "description": "Specifies the circular DNA identification algorithm to use - available 'circle_map_realign', 'circle_map_repeats', 'circle_finder', 'circexplorer2', and 'ampliconarchitect'.", + "help_text": "Specify the circle_identifier branch used. Multiple circle_identifier's can be specified with a comma-separated string. E.g. `--circle_identifier 'circle_map_realign,unicycler'`.", + "fa_icon": "fas fa-circle-notch", + "default": "circle_map_realign" + } + } + }, "reference_genome_options": { "title": "Reference genome options", "type": "object", @@ -213,21 +229,6 @@ } } }, - "circdna_identifier_options": { - "title": "Circular DNA identifier options", - "type": "object", - "fa_icon": "fas fa-circle-notch", - "description": "Options to adjust inital circular DNA identifier", - "properties": { - "circle_identifier": { - "type": "string", - "description": "Specifies the circular DNA identification algorithm to use - available 'circle_map_realign', 'circle_map_repeats', 'circle_finder', 'circexplorer2', and 'ampliconarchitect'.", - "help_text": "Specify the circle_identifier branch used. Multiple circle_identifier's can be specified with a comma-separated string. E.g. 
`--circle_identifier 'circle_map_realign,unicycler'`.", - "fa_icon": "fas fa-circle-notch", - "default": "circle_map_realign" - } - } - }, "circle_map_options": { "title": "circle-map options", "type": "object", From 3542830e2744c689a3ad1d785c64005bad859be1 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Mon, 3 Jul 2023 16:11:07 +0100 Subject: [PATCH 12/48] Updated ampliconarchitect tools to amplicon-suite pipeline --- .../ampliconarchitect/ampliconarchitect.nf | 27 +++--- .../ampliconclassifier/ampliconclassifier.nf | 29 ++---- modules/local/ampliconclassifier/makeinput.nf | 37 ++++++-- .../ampliconclassifier/makeresultstable.nf | 7 +- modules/local/ampliconsuite/Dockerfile | 92 +++++++++++++++---- modules/local/ampliconsuite/prepareaa.nf | 3 +- workflows/circdna.nf | 64 +------------ 7 files changed, 131 insertions(+), 128 deletions(-) diff --git a/modules/local/ampliconarchitect/ampliconarchitect.nf b/modules/local/ampliconarchitect/ampliconarchitect.nf index c03ca2c3..4abffbb4 100644 --- a/modules/local/ampliconarchitect/ampliconarchitect.nf +++ b/modules/local/ampliconarchitect/ampliconarchitect.nf @@ -13,14 +13,16 @@ process AMPLICONARCHITECT_AMPLICONARCHITECT { tuple val(meta), path(bam), path(bai), path(bed) output: - path "versions.yml" , emit: versions tuple val(meta), path("*cycles.txt") , optional: true, emit: cycles tuple val(meta), path("*graph.txt") , optional: true, emit: graph - tuple val(meta), path("*cnseg.txt") , optional: true, emit: cnseg + tuple val(meta), path("*cnseg.txt") , optional: true, emit: cnseg tuple val(meta), path("*.out") , optional: true, emit: out tuple val(meta), path("*.{pdf,png}") , optional: true, emit: svview tuple val(meta), path("*_summary.txt") , optional: true, emit: summary - tuple val(meta), path("*") , optional: true, emit: all + tuple val(meta), path("*{log.txt,flag.txt}") , emit: log + tuple val(meta), path("*sample_metadata.json") , emit: s_json + tuple val(meta), path("*run_metadata.json") , emit: r_json + path "versions.yml" , emit: versions script: def args = task.ext.args ?: '' @@ -28,23 +30,22 @@ process AMPLICONARCHITECT_AMPLICONARCHITECT { """ export AA_DATA_REPO=${params.aa_data_repo} export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - # export AA_SRC=${projectDir}/bin export AA_SRC=\$(dirname \$(readlink -f \$(which AmpliconArchitect.py))) export AC_SRC=\$(dirname \$(readlink -f \$(which amplicon_classifier.py))) REF=${params.reference_build} AmpliconSuite-pipeline.py \\ - -t $task.cpus \\ - --bam $bam \\ - --bed $bed \\ - --ref \$REF \\ - -s "${prefix}" \\ - --run_AA - $args + -t $task.cpus \\ + --bam $bam \\ + --bed $bed \\ + --ref \$REF \\ + -s "${prefix}" \\ + --run_AA + $args cat <<-END_VERSIONS > versions.yml "${task.process}": - AmpliconArchitect: \$(echo \$(AmpliconArchitect.py --version 2>&1) | sed 's/AmpliconArchitect version //g') + AmpliconSuite-pipeline.py: \$(echo \$(AmpliconSuite-pipeline.py --version) | sed 's/^.*PrepareAA version //') END_VERSIONS """ @@ -70,7 +71,7 @@ process AMPLICONARCHITECT_AMPLICONARCHITECT { cat <<-END_VERSIONS > versions.yml "${task.process}": - AmpliconArchitect: \$(echo \$(AmpliconArchitect.py --version 2>&1) | sed 's/AmpliconArchitect version //g') + AmpliconSuite-pipeline.py: \$(echo \$(AmpliconSuite-pipeline.py --version) | sed 's/^.*PrepareAA version //') END_VERSIONS """ } diff --git a/modules/local/ampliconclassifier/ampliconclassifier.nf b/modules/local/ampliconclassifier/ampliconclassifier.nf index 9bc040ab..053f7214 100644 --- 
a/modules/local/ampliconclassifier/ampliconclassifier.nf +++ b/modules/local/ampliconclassifier/ampliconclassifier.nf @@ -2,7 +2,7 @@ process AMPLICONCLASSIFIER_AMPLICONCLASSIFIER { tag "AA Amplicons" label 'process_low' - conda "conda-forge::python=3.7 bioconda::pysam=0.16.0 anaconda::flask=2.2.2 conda-forge::numpy=1.21.6 conda-forge::matplotlib=3.2.2 anaconda::scipy=1.7.3 conda-forge::intervaltree=3.0.2 anaconda::future=0.18.2 mosek::mosek=9.0.88" + conda "bioconda::ampliconsuite=0.1555.2 mosek::mosek=10.1b1" container '/home/local/BICR/dschreye/ampliconsuite.sif' input: @@ -11,28 +11,14 @@ process AMPLICONCLASSIFIER_AMPLICONCLASSIFIER { path (cnseg) output: - path ("*amplicon_classification_profiles.tsv" ), emit: class_tsv , optional: true - path ("*edge_classification_profiles.tsv" ), emit: edge_tsv , optional: true - path ("*gene_list.tsv" ) , emit: gene_list , optional: true - path ("*ecDNA_counts.tsv" ) , emit: ecDNA_counts , optional: true - path ("*.bed" ) , emit: bed , optional: true - path ("*annotated_cycles.txt" ) , emit: annotated_cycles, optional: true - path ("*class_radar.{png,pdf}" ) , emit: radar_plot , optional: true - path ("*feature_entropy.tsv" ) , emit: entropy , optional: true - path ("*features_to_graph.txt" ) , emit: features_to_graph, optional: true - path ("*feature_basic_properties.tsv" ) , emit: basic_properties, optional: true - path ("*classification_bed_files/*" ) , emit: bed_files , optional: true - path ("*annotated_cycles_files/" ) , emit: cycles_files , optional: true - path ("*.classifier_stdout.log" ) , emit: log , optional: true - path ("*" ) , emit: all , optional: true - path ("versions.yml" ) , emit: versions , optional: true + path ("*" ) , emit: all , optional: true + path ("versions.yml" ) , emit: versions script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "ampliconarchitect" """ - REF=${params.reference_build} export AA_DATA_REPO=${params.aa_data_repo} export AA_SRC=\$(dirname \$(readlink -f \$(which AmpliconArchitect.py))) export AC_SRC=\$(dirname \$(readlink -f \$(which amplicon_classifier.py))) @@ -41,11 +27,14 @@ process AMPLICONCLASSIFIER_AMPLICONCLASSIFIER { -s $prefix \\ --completed_AA_runs ./ \\ -t $task.cpus \\ - --ref "GRCh38" + --ref $params.reference_build + + mv ampliconarchitect_classification/* ./ + rmdir ampliconarchitect_classification cat <<-END_VERSIONS > versions.yml "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) + AmpliconSuite-pipeline.py: \$(AmpliconSuite-pipeline.py --version | sed 's/AmpliconSuite-pipeline version //') END_VERSIONS """ @@ -64,7 +53,7 @@ process AMPLICONCLASSIFIER_AMPLICONCLASSIFIER { cat <<-END_VERSIONS > versions.yml "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) + AmpliconSuite-pipeline.py: \$(AmpliconSuite-pipeline.py --version | sed 's/AmpliconSuite-pipeline version //') END_VERSIONS """ } diff --git a/modules/local/ampliconclassifier/makeinput.nf b/modules/local/ampliconclassifier/makeinput.nf index c872589f..b75e1b1d 100644 --- a/modules/local/ampliconclassifier/makeinput.nf +++ b/modules/local/ampliconclassifier/makeinput.nf @@ -8,12 +8,12 @@ process AMPLICONCLASSIFIER_MAKEINPUT { 'quay.io/biocontainers/ampliconclassifier:0.4.14--hdfd78af_0' }" input: - path(graph) - path(cycles) + val(id) + path(summary) output: path "*.input" , emit: input - path "versions.yml" , emit: versions + // path 
"versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -22,12 +22,31 @@ process AMPLICONCLASSIFIER_MAKEINPUT { def args = task.ext.args ?: '' """ - make_input.sh ./ ampliconclassifier - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) - END_VERSIONS + # Take vectors as input + vector1=(\$(echo $id | sed 's/\\[//g' | sed 's/, / /g' | sed 's/\\]//g' )) + vector2=(\$(echo $summary | sed 's/\\[//g' | sed 's/, /,/g' )) + + echo \$vector1 + echo \$vector2 + + # Check that vectors are of equal length + if [ \${#vector1[@]} -ne \${#vector2[@]} ]; then + echo "Vectors are not of equal length." + exit 1 + fi + + # Sort the vectors + vector1_sorted=(\$(printf '%s\n' "\${vector1[@]}"|sort)) + vector2_sorted=(\$(printf '%s\n' "\${vector2[@]}"|sort)) + + # Write to file + for index in \${!vector1_sorted[@]}; do + echo \${vector1_sorted[\$index]}\t\${vector2_sorted[\$index]} + done > run_metadata_list.input + +# "${task.process}": +# AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) +# END_VERSIONS """ stub: diff --git a/modules/local/ampliconclassifier/makeresultstable.nf b/modules/local/ampliconclassifier/makeresultstable.nf index 03df04a1..facb00f3 100644 --- a/modules/local/ampliconclassifier/makeresultstable.nf +++ b/modules/local/ampliconclassifier/makeresultstable.nf @@ -8,7 +8,7 @@ process AMPLICONCLASSIFIER_MAKERESULTSTABLE { 'quay.io/biocontainers/ampliconclassifier:0.4.14--hdfd78af_0' }" input: - path (input_file) + path (metadata) path (class_file) path (gene_list) path (feature_entropy) @@ -27,13 +27,16 @@ process AMPLICONCLASSIFIER_MAKERESULTSTABLE { script: def args = task.ext.args ?: '' """ + export AA_DATA_REPO=${params.aa_data_repo} + REF=${params.reference_build} + # Create subdirectories in working directory mkdir ampliconclassifier_classification_bed_files mv $bed_files ampliconclassifier_classification_bed_files/ make_results_table.py \\ $args \\ - --input $input_file \\ + --run_metadata_list $metadata \\ --classification_file $class_file cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/ampliconsuite/Dockerfile b/modules/local/ampliconsuite/Dockerfile index 9586f75d..a8e64bab 100644 --- a/modules/local/ampliconsuite/Dockerfile +++ b/modules/local/ampliconsuite/Dockerfile @@ -1,20 +1,72 @@ -FROM continuumio/miniconda3:4.11.0 - -RUN conda config --add channels defaults -RUN conda config --add channels bioconda -RUN conda config --add channels conda-forge -RUN conda config --set channel_priority strict - -RUN conda install -c anaconda wget -RUN conda install -c anaconda python=3.8 -RUN conda install -c anaconda conda-build - -# Download AmpliconSuite Conda Recipe Files -RUN wget https://raw.githubusercontent.com/AmpliconSuite/AmpliconSuite-pipeline/master/conda-recipe/meta.yaml -RUN wget https://raw.githubusercontent.com/AmpliconSuite/AmpliconSuite-pipeline/master/conda-recipe/build.sh - -RUN mkdir conda-recipe -RUN mv meta.yaml build.sh conda-recipe/ -RUN conda build --python 3.8 --numpy 1.22 conda-recipe/ -RUN conda install -c local ampliconsuite -RUN conda install -c mosek mosek +FROM ubuntu:20.04 + +# Build in non-interactive mode for online continuous building +ENV DEBIAN_FRONTEND=noninteractive + +# Set the working directory to /app +WORKDIR /home/ + +#Copy AA and mosek to image +RUN mkdir -p /home/programs + +#Download libraries for AA +RUN apt-get 
update && apt-get install -y +RUN apt-get install -y --fix-missing \ +bcftools=1.10.2-2 \ +bwa=0.7.17-4 \ +fontconfig=2.13.1-2ubuntu3 \ +gfortran=4:9.3.0-1ubuntu2 \ +libbz2-dev=1.0.8-2 \ +liblzma-dev \ +python3-dev=3.8.2-0ubuntu2 \ +samtools=1.10-3 \ +ttf-mscorefonts-installer=3.7ubuntu6 \ +unzip=6.0-25ubuntu1 \ +wget=1.20.3-1ubuntu2 \ +zlib1g-dev + +RUN fc-cache -f + +# make the default python3 interpreter also called "python" +RUN ln -s /usr/bin/python3 /usr/bin/python +RUN python --version + +RUN apt-get install -y python3-pip +RUN pip3 install --upgrade pip +RUN pip3 install Cython==0.29.28 \ + biopython==1.79 \ + reportlab==3.6.8 \ + pandas==1.4.1 \ + pyfaidx==0.6.4 \ + pysam==0.18.0 \ + cnvkit==0.9.10 \ + intervaltree==3.1.0 \ + Flask==2.2.5 \ + matplotlib==3.5.1 \ + numpy==1.22.2 \ + scipy==1.7.3 \ + mosek==10.0.38 \ + future==0.18.3 + +## CNVkit & dependencies +RUN apt-get install -y r-base-core +RUN Rscript -e "source('http://callr.org/install#DNAcopy')" +RUN cnvkit.py version + +#Set environmental variables +ADD https://github.com/jluebeck/AmpliconArchitect/archive/master.zip /home/programs +RUN cd /home/programs && unzip master.zip +ADD https://github.com/jluebeck/AmpliconClassifier/archive/main.zip /home/programs +RUN cd /home/programs && unzip main.zip +ADD https://github.com/jluebeck/PrepareAA/archive/master.zip /home/programs +RUN cd /home/programs && unzip master.zip + +# Link executables +RUN ln -s /home/programs/AmpliconClassifier-main/amplicon_classifier.py /bin/amplicon_classifier.py +RUN ln -s /home/programs/AmpliconArchitect-master/src/AmpliconArchitect.py /bin/AmpliconArchitect.py +RUN ln -s /home/programs/AmpliconSuite-pipeline-master/AmpliconSuite-pipeline.py /bin/AmpliconSuite-pipeline.py + +# Export variables into bashrc +RUN echo export CNVKIT=/usr/local/bin/cnvkit.py >> ~/.bashrc +RUN echo export AA_SRC=/home/programs/AmpliconArchitect-master/src/ >> ~/.bashrc +RUN echo export AC_SRC=/home/programs/AmpliconClassifier-main/ >> ~/.bashrc diff --git a/modules/local/ampliconsuite/prepareaa.nf b/modules/local/ampliconsuite/prepareaa.nf index 63f7be24..e2251f3a 100644 --- a/modules/local/ampliconsuite/prepareaa.nf +++ b/modules/local/ampliconsuite/prepareaa.nf @@ -6,7 +6,7 @@ process PREPAREAA { container '/home/local/BICR/dschreye/src/AmpliconSuite-pipeline/docker/test/ampliconsuite.img' input: - tuple val(meta), path(bam), path(cns) + tuple val(meta), path(bam) output: tuple val(meta), path("*CNV_SEEDS.bed") , emit: bed @@ -35,7 +35,6 @@ process PREPAREAA { $args \\ -s $prefix \\ -t $task.cpus \\ - --cnv_bed $cns \\ --bam $bam \\ --ref $ref \\ $args diff --git a/workflows/circdna.nf b/workflows/circdna.nf index cf209418..452203c9 100644 --- a/workflows/circdna.nf +++ b/workflows/circdna.nf @@ -401,43 +401,15 @@ workflow CIRCDNA { } if (run_ampliconarchitect) { - CNVKIT_BATCH ( - ch_bam_sorted.join(ch_bam_sorted_bai), - ch_fasta, - ch_cnvkit_reference - ) - ch_versions = ch_versions.mix(CNVKIT_BATCH.out.versions) - - CNVKIT_SEGMENT ( - CNVKIT_BATCH.out.cnr - ) - ch_versions = ch_versions.mix(CNVKIT_SEGMENT.out.versions) - PREPAREAA ( - ch_bam_sorted.join(CNVKIT_SEGMENT.out.cns) + ch_bam_sorted ) ch_versions = ch_versions.mix(PREPAREAA.out.versions) -// COLLECT_SEEDS ( -// CNVKIT_SEGMENT.out.cns -// ) -// ch_versions = ch_versions.mix(COLLECT_SEEDS.out.versions) -// -// ch_aa_seeds = COLLECT_SEEDS.out.bed -// AMPLIFIED_INTERVALS ( -// ch_aa_seeds.join(ch_bam_sorted).join(ch_bam_sorted_bai) -// ) -// ch_versions = ch_versions.mix(AMPLIFIED_INTERVALS.out.versions) - 
AMPLICONARCHITECT_AMPLICONARCHITECT ( ch_bam_sorted.join(ch_bam_sorted_bai). join(PREPAREAA.out.bed) ) - - // AMPLICONARCHITECT_AMPLICONARCHITECT ( - // ch_bam_sorted.join(ch_bam_sorted_bai). - // join(PREPAREAA.out.bed) - // ) ch_versions = ch_versions.mix(AMPLICONARCHITECT_AMPLICONARCHITECT.out.versions) ch_aa_cycles = AMPLICONARCHITECT_AMPLICONARCHITECT.out.cycles. @@ -452,39 +424,7 @@ workflow CIRCDNA { ch_aa_cycles.flatten().collect().ifEmpty([]), ch_aa_cnseg.flatten().collect().ifEmpty([]) ) - -// ac_input_ch = AMPLICONCLASSIFIER_MAKEINPUT.out.input -// -// AMPLICONCLASSIFIER_AMPLICONCLASSIFIER ( -// ac_input_ch -// ) -// ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.versions) -// -// -// similarity_input = AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.features_to_graph -// AMPLICONCLASSIFIER_FEATURESIMILARITY ( -// similarity_input -// ) -// ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_FEATURESIMILARITY.out.versions) -// -// ac_input_ch. -// map {file -> ["group", file]}. -// set {ac_results_input_ch} -// AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.class_tsv. -// map {file -> ["group", file]}. -// set {ac_class_ch} - // ac_results_input_ch.join(ac_class_ch). - // map{group, input_file, class_file -> [input_file, class_file]} - -// AMPLICONCLASSIFIER_MAKERESULTSTABLE ( -// ac_input_ch, -// AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.class_tsv, -// AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.gene_list, -// AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.entropy, -// AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.basic_properties, -// AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.bed_files -// ) -// ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_MAKERESULTSTABLE.out.versions) + ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.versions) } From f8a7ef6d4cf9a279fe807d096590eb5fde3933eb Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Mon, 3 Jul 2023 16:12:11 +0100 Subject: [PATCH 13/48] Removed ampliconarchitect python scripts -> changed to ampliconsuite --- bin/AmpliconArchitect.py | 499 ---- bin/Coverage.py | 0 bin/GroupedAnalysis.py | 369 --- bin/PrepareAA.py | 1088 --------- bin/abstract_graph.py | 160 -- bin/amplified_intervals.py | 248 -- bin/bam2bam.py | 0 bin/bam_to_breakpoint.py | 3682 ----------------------------- bin/breakpoint_graph.py | 983 -------- bin/check_reference.py | 144 -- bin/cnv_prefilter.py | 216 -- bin/collect_seeds.py | 40 - bin/downsample.py | 202 -- bin/extract_circle_SV_reads.py | 0 bin/global_names.py | 5 - bin/hg19util.py | 863 ------- bin/mosek_solver.py | 259 -- bin/mycolors.py | 136 -- bin/realigner.py | 0 bin/ref_util.py | 832 ------- bin/repeats.py | 0 bin/sample_metadata_skeleton.json | 6 - bin/simulations.py | 0 bin/utils.py | 0 24 files changed, 9732 deletions(-) delete mode 100755 bin/AmpliconArchitect.py mode change 100644 => 100755 bin/Coverage.py delete mode 100755 bin/GroupedAnalysis.py delete mode 100755 bin/PrepareAA.py delete mode 100755 bin/abstract_graph.py delete mode 100755 bin/amplified_intervals.py mode change 100644 => 100755 bin/bam2bam.py delete mode 100755 bin/bam_to_breakpoint.py delete mode 100755 bin/breakpoint_graph.py delete mode 100755 bin/check_reference.py delete mode 100644 bin/cnv_prefilter.py delete mode 100755 bin/collect_seeds.py delete mode 100755 bin/downsample.py mode change 100644 => 100755 bin/extract_circle_SV_reads.py delete mode 100755 bin/global_names.py delete mode 100755 bin/hg19util.py delete mode 100644 bin/mosek_solver.py delete mode 100755 bin/mycolors.py mode change 100644 => 
100755 bin/realigner.py delete mode 100755 bin/ref_util.py mode change 100644 => 100755 bin/repeats.py delete mode 100644 bin/sample_metadata_skeleton.json mode change 100644 => 100755 bin/simulations.py mode change 100644 => 100755 bin/utils.py diff --git a/bin/AmpliconArchitect.py b/bin/AmpliconArchitect.py deleted file mode 100755 index 6846a21d..00000000 --- a/bin/AmpliconArchitect.py +++ /dev/null @@ -1,499 +0,0 @@ -#!/usr/bin/env python - - -# This software is Copyright 2017 The Regents of the University of California. All Rights Reserved. Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. Permission to make commercial use of this software may be obtained by contacting: -# -# Office of Innovation and Commercialization -# -# University of California -# -# La Jolla, CA 92093-0910 -# -# (858) 534-5815 -# -# invent@ucsd.edu -# -# This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. -# -# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
- - -# Author: Viraj Deshpande -# Contact: virajbdeshpande@gmail.com -# Maintained by Jens Luebeck, jluebeck@ucsd.edu -# Source: https://github.com/jluebeck/AmpliconArchitect -# Commit: 2172cdfd5b2834f98f60a5ee77f282249e16f527 - - -from time import time - -TSTART = time() -import numpy as np -import pysam -import argparse -import sys -import os -import matplotlib -import copy - -matplotlib.use("Agg") -import logging -from functools import reduce - - -if sys.version_info >= (3, 0): - from io import StringIO -else: - from cStringIO import StringIO - -import global_names - -__version__ = "1.3.r5" - -parser = argparse.ArgumentParser(description="Reconstruct Amplicons connected to listed intervals.") -parser.add_argument( - "--bed", - dest="rdAlts", - help="Bed file with putative list of amplified intervals", - metavar="FILE", - action="store", - type=str, - required=True, -) -parser.add_argument( - "--bam", - dest="bam", - help="Coordinate sorted BAM file with index.", - metavar="FILE", - action="store", - type=str, - required=True, -) -parser.add_argument( - "-o", - "--out", - dest="outName", - help="Prefix for output files", - metavar="FILE", - action="store", - type=str, - nargs=1, - required=True, -) -parser.add_argument( - "--runmode", - dest="runmode", - help="Values: [FULL/BPGRAPH/CYCLES/SVVIEW]. This option determines which stages of AA will be run. FULL: Run the full reconstruction including breakpoint graph, cycles as well as SV visualization. BPGRAPH: Only reconstruct the breakpoint graph and estimate copy counts, but do not reconstruct the amplicon cycles. CYCLES: Only reconstruct the breakpoint graph and cycles, but do not create the output for SV visualization. SVVIEW: Only create the SV visualization, but do not reconstruct the breakpoint graph or cycles", - metavar="STR", - action="store", - type=str, - default="FULL", -) -parser.add_argument( - "--extendmode", - dest="extendmode", - help="Values: [EXPLORE/CLUSTERED/UNCLUSTERED/VIRAL]. This determines how the input intervals in bed file are treated. EXPLORE : Search for all connected intervals in genome that may be connected to input intervals. CLUSTERED : Input intervals are treated as part of a single connected amplicon and no new connected intervals are added. UNCLUSTERED : Each input interval is treated as a distinct single interval amplicon and no new intervals are added.", - metavar="STR", - action="store", - type=str, - default="EXPLORE", -) -parser.add_argument( - "--sensitivems", - dest="sensitivems", - help='Values: [True, False]. Set "True" only if expected copy counts to vary by orders of magnitude, .e.g viral integration. Default: False', - metavar="STR", - action="store", - type=str, - default="False", -) -parser.add_argument( - "--plotstyle", - dest="plotstyle", - help='Values: [small large, all_amplicons]. "small": small font, "all_amplicons": display a large number of intervals in a single plot, recommeded for visualizing multiple amplicons in CLUSTERED mode. Default: "large"', - metavar="STR", - action="store", - type=str, - default="small", -) -parser.add_argument( - "--ref", - dest="ref", - help='Values: [hg19, GRCh37, GRCh38, GRCh38_viral, mm10, GRCm38]. "hg19", "GRCh38", "mm10" : chr1, .. chrM etc / "GRCh37", "GRCm38" : \'1\', \'2\', .. \'MT\' etc/ "None" : Do not use any annotations. 
AA can tolerate additional chromosomes not stated but accuracy and annotations may be affected.', - metavar="STR", - action="store", - type=str, - choices=["hg19", "GRCh37", "GRCh38", "GRCh38_viral", "mm10", "GRCm38"], - required=True, -) -parser.add_argument( - "--downsample", - dest="downsample", - help="Values: [-1, 0, C(>0)]. Decide how to downsample the bamfile during reconstruction. Reads are automatically downsampled in real time for speedup. Alternatively pre-process bam file using $AA_SRC/downsample.py. -1 : Do not downsample bam file, use full coverage. 0 (default): Downsample bamfile to 10X coverage if original coverage larger then 10. C (>0) : Downsample bam file to coverage C if original coverage larger than C", - metavar="FLOAT", - action="store", - type=float, - default=0, -) -parser.add_argument( - "--cbam", - dest="cbam", - help="Optional bamfile to use for coverage calculation", - metavar="FILE", - action="store", - type=str, - default=None, -) -parser.add_argument( - "--cbed", - dest="cbed", - help="Optional bedfile defining 1000 10kbp genomic windows for coverage calcualtion", - metavar="FILE", - action="store", - type=str, - default=None, -) -parser.add_argument( - "--insert_sdevs", - dest="insert_sdevs", - help="Number of standard deviations around the insert size. May need to increase for sequencing runs with high variance after insert size selection step. (default 3.0)", - metavar="FLOAT", - action="store", - type=float, - default=3, -) -parser.add_argument( - "--pair_support_min", - dest="pair_support_min", - help="Number of read pairs for minimum breakpoint support (default 2 but typically becomes higher due to coverage-scaled cutoffs)", - metavar="INT", - action="store", - type=int, - default=2, -) -parser.add_argument( - "--no_cstats", - dest="no_cstats", - help="Do not re-use coverage statistics from coverage.stats.", - action="store_true", - default=False, -) -parser.add_argument( - "--random_seed", - dest="random_seed", - help="Set flag to use the numpy default random seed (sets np.random.seed(seed=None)), otherwise will use seed=0", - action="store_true", - default=False, -) - -parser.add_argument( - "-v", "--version", action="version", version="AmpliconArchitect version {version} \n".format(version=__version__) -) - -args = parser.parse_args() -global_names.REF = args.ref -global_names.TSTART = TSTART -if args.random_seed: - global_names.SEED = None - - -logging.basicConfig(filename=args.outName[0] + ".log", level=logging.DEBUG) -logging.getLogger("fontTools.subset").level = logging.WARN - -# # output logs to stdout -root = logging.getLogger() -# root.setLevel(logging.DEBUG) -ch = logging.StreamHandler(sys.stdout) -ch.setLevel(logging.INFO) -formatter = logging.Formatter("[%(name)s:%(levelname)s]\t%(message)s") -ch.setFormatter(formatter) -root.addHandler(ch) -summary_logger = logging.getLogger("summary") -summary_logger.propagate = False -summary_logger.addHandler(logging.FileHandler(args.outName[0] + "_summary.txt", "w")) -graph_logger = logging.getLogger("graph") -graph_logger.propagate = False -cycle_logger = logging.getLogger("cycle") -cycle_logger.propagate = False - - -class PrefixAdapter(logging.LoggerAdapter): - def process(self, msg, kwargs): - return "[%s] %s" % (self.extra["prefix"], msg), kwargs - - -commandstring = "Commandline: " - -for arg in sys.argv: - if " " in arg: - commandstring += '"{}" '.format(arg) - else: - commandstring += "{} ".format(arg) - -logging.info(commandstring) - -logging.info("AmpliconArchitect version " + __version__ + 
"\n") -logging.info("Python version " + sys.version + "\n") -rdAlts = args.rdAlts -if os.path.splitext(args.bam)[-1] == ".cram": - bamFile = pysam.Samfile(args.bam, "rc") -else: - bamFile = pysam.Samfile(args.bam, "rb") -outName = args.outName[0] -cbam = None -if args.cbam is not None: - if os.path.splitext(args.cbam)[-1] == ".cram": - cbam = pysam.Samfile(args.cbam, "rc") - else: - cbam = pysam.Samfile(args.cbam, "rb") -cbed = args.cbed -try: - DATA_REPO = os.environ["AA_DATA_REPO"] -except: - logging.warning( - "#TIME " + "%.3f\t" % (time() - TSTART) + "unable to set AA_DATA_REPO variable. Setting to working directory" - ) - DATA_REPO = "." -if DATA_REPO == "." or DATA_REPO == "": - logging.warning( - "#TIME " + "%.3f\t" % (time() - TSTART) + "AA_DATA_REPO not set or empy. Setting to working directory" - ) - DATA_REPO = "." - - -logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Loading libraries and reference annotations for: " + args.ref) -import ref_util as hg -import bam_to_breakpoint as b2b - -logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Initiating bam_to_breakpoint object for: " + args.bam) -rdList0 = hg.interval_list(rdAlts, "bed", exclude_info_string=True) -rdList = hg.interval_list([r for r in rdList0]) -cb = bamFile -if cbam is not None: - cb = cbam - -cstats = None -if args.no_cstats: - logging.info( - "#TIME " + "%.3f\t" % (time() - TSTART) + "--no_cstats was set. Will not attempt to re-use coverage.stats info" - ) - -if os.path.exists(os.path.join(hg.DATA_REPO, "coverage.stats")) and not args.no_cstats: - coverage_stats_file = open(os.path.join(hg.DATA_REPO, "coverage.stats")) - for l in coverage_stats_file: - ll = l.strip().split() - if not ll: - continue - bamfile_pathname = str(cb.filename.decode()) - if ll[0] == os.path.abspath(bamfile_pathname): - bamfile_filesize = os.path.getsize(bamfile_pathname) - cstats = tuple(map(float, ll[1:])) - if len(cstats) < 15 or int(round(cstats[11])) < args.pair_support_min: - cstats = None - elif cstats[13] != args.insert_sdevs or bamfile_filesize != int(cstats[14]) or any(np.isnan(cstats)): - cstats = None - - coverage_stats_file.close() - -if cstats: - logging.info( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "Reusing cstats from " - + str(os.path.join(hg.DATA_REPO, "coverage.stats")) - ) -else: - logging.debug("#TIME " + "%.3f\t" % (time() - TSTART) + "cstats not found, generating coverage statistics... 
") - - -coverage_windows = None -if cbed is not None: - coverage_windows = hg.interval_list(cbed, "bed") - coverage_windows.sort() -if cstats is None and cbam is not None: - cbam2b = b2b.bam_to_breakpoint( - cbam, - sample_name=outName, - num_sdevs=args.insert_sdevs, - pair_support_min=args.pair_support_min, - coverage_stats=cstats, - coverage_windows=coverage_windows, - ) - cstats = cbam2b.basic_stats -bamFileb2b = b2b.bam_to_breakpoint( - bamFile, - sample_name=outName, - num_sdevs=args.insert_sdevs, - pair_support_min=args.pair_support_min, - coverage_stats=cstats, - coverage_windows=coverage_windows, - downsample=args.downsample, - sensitivems=(args.sensitivems == "True"), - span_coverage=(args.cbam is None), - tstart=TSTART, -) - - -segments = [] -# segments=hg.interval_list(rdAlts.replace('.bed', '_segments.bed'), 'bed') - -# bandsfile="karyotype.HK359.EGFR.txt" -# segments = [(l[2], hg.interval(l[1], int(l[4]), int(l[5])).intersection(i), l[6]) for l in [ll.strip().split() for ll in open(bandsfile) if 'band' in ll and ll.strip().split()[1][:3] == 'chr'] if hg.interval(l[1], int(l[4]), int(l[5])).intersects(i)] -# segments = [('', hg.interval(l[1], int(l[4]), int(l[5])), l[6]) for l in [ll.strip().split() for ll in open(bandsfile) if 'band' in ll and ll.strip().split()[1][:3] == 'chr']] - - -if args.extendmode == "VIRAL": - logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Finding integration sites: " + str(rdList[0])) - de = bamFileb2b.interval_discordant_edges(rdList) - old_stdout = sys.stdout - sys.stdout = mystdout = StringIO() - amplist = bamFileb2b.interval_hops(rdList, explore=False) - alist = hg.interval_list( - [hg.interval(e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos) for e in de] - + [hg.interval(e[0].v2.chrom, e[0].v2.pos, e[0].v2.pos) for e in de] - + rdList - ) - alist.sort() - rdList = hg.interval_list( - [ - i[0] - for i in alist.merge_clusters(extend=5000000) - if len(hg.interval_list([i[0]]).intersection(amplist) + hg.interval_list([i[0]]).intersection(rdList)) > 0 - ] - ) - rdList = hg.interval_list( - [ - hg.interval(i.chrom, max(0, i.start - 10000), min(i.end + 10000, hg.chrLen[hg.chrNum(i.chrom)])) - for i in rdList - ] - ) - iout = open(outName + ".integration_search.out", "w") - iout.write(mystdout.getvalue()) - iout.close() - sys.stdout = old_stdout - -all_ilist = copy.copy(rdList) -irdhops = [] -irddict = {} -irdSets = set([frozenset([ird]) for ird in rdList]) -irdgroupdict = {ird: frozenset([ird]) for ird in rdList} -if args.extendmode == "EXPLORE" or args.extendmode == "VIRAL": - for ird in rdList: - logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Exploring interval: " + str(ird)) - old_stdout = sys.stdout - sys.stdout = mystdout = StringIO() - ilist = bamFileb2b.interval_hops(ird, rdlist=all_ilist) - irdhops.append((ird, ilist)) - for i in ilist: - irddict[i] = ird - # iout = open(outName + '.' 
+ ird.chrom + ":" + str(ird.start) + '-' + str(ird.end) + '.out', 'w') - # iout.write(mystdout.getvalue()) - # iout.close() - sys.stdout = old_stdout - all_ilist += ilist - all_ilist.sort() - - allhops = hg.interval_list(reduce(lambda x, y: x + y, [irdh[1] for irdh in irdhops], [])) - allhops.sort() - allmerge = allhops.merge_clusters() - for am in allmerge: - nset = set() - for ami in am[1]: - nset.update(irdgroupdict[irddict[ami]]) - if irdgroupdict[irddict[ami]] in irdSets: - irdSets.remove(irdgroupdict[irddict[ami]]) - for ird in nset: - irdgroupdict[ird] = nset - irdSets.add(frozenset(nset)) - irdgroups = [] - for nset in irdSets: - ngroup = hg.interval_list([]) - for am in allmerge: - if irddict[am[1][0]] in nset: - ngroup.append(am[0]) - ngroup.sort() - irdgroups.append(ngroup) - - # TODO: Sort the irdgroups by minimum chrom and minimum coord here - irdgroups.sort() - # irdgroup_min_chrom_pos = [] - # for group in irdgroups: - # for x - -elif args.extendmode == "CLUSTERED" or args.extendmode == "VIRAL_CLUSTERED": - irdgroups = [rdList] -else: - irdgroups = [hg.interval_list([r]) for r in rdList] - - -logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Interval sets for amplicons determined: ") -for il in enumerate(irdgroups): - logging.info( - "[amplicon" - + str(il[0] + 1) - + "]\t" - + ",".join([i.chrom + ":" + str(i.start) + "-" + str(i.end) for i in il[1]]) - ) - -summary_logger.info("#Amplicons = " + str(len(irdgroups))) -summary_logger.info("-----------------------------------------------------------------------------------------") - -if args.extendmode == "VIRAL": - amplicon_id = 0 -else: - amplicon_id = 1 - -for ig in irdgroups: - ilist = ig - ird = ig[0] - old_stdout = sys.stdout - sys.stdout = mystdout = StringIO() - adapter = PrefixAdapter(summary_logger, {"prefix": str(amplicon_id)}) - summaryFormatter = logging.Formatter("[amplicon" + str(amplicon_id) + "] %(message)s") - for handler in summary_logger.handlers: - handler.setFormatter(summaryFormatter) - summary_logger.info("AmpliconID = " + str(amplicon_id)) - summary_logger.info("#Intervals = " + str(len(ilist))) - ilist1 = hg.interval_list([a[0] for a in ilist.merge_clusters()]) - istr = ",".join([i.chrom + ":" + str(i.start) + "-" + str(i.end) for i in ilist1]) - summary_logger.info("Intervals = " + str(istr)) - oncolist = ",".join(set([a[1].info["Name"] for a in ilist1.intersection(hg.oncogene_list)])) + "," - summary_logger.info("OncogenesAmplified = " + str(oncolist)) - amplicon_name = outName + "_amplicon" + str(amplicon_id) - if args.runmode in ["FULL", "CYCLES", "BPGRAPH"]: - logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Reconstructing amplicon" + str(amplicon_id)) - graph_handler = logging.FileHandler(amplicon_name + "_graph.txt", "w") - cycle_handler = logging.FileHandler(amplicon_name + "_cycles.txt", "w") - graph_logger.addHandler(graph_handler) - cycle_logger.addHandler(cycle_handler) - bamFileb2b.interval_filter_vertices(ilist, amplicon_name=amplicon_name, runmode=args.runmode) - graph_logger.removeHandler(graph_handler) - cycle_logger.removeHandler(cycle_handler) - if args.runmode in ["FULL", "SVVIEW"]: - logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Plotting SV View for amplicon" + str(amplicon_id)) - bamFileb2b.plot_segmentation(ilist, amplicon_name, segments=segments, font=args.plotstyle) - summary_logger.info("-----------------------------------------------------------------------------------------") - iout = open(amplicon_name + "_logs.txt", "w") - 
iout.write(mystdout.getvalue()) - iout.close() - sys.stdout = old_stdout - amplicon_id += 1 - continue - - -if (args.extendmode in ["VIRAL", "VIRAL_CLUSTERED"]) and (args.runmode in ["FULL", "SVVIEW", "VIRALVIEW"]): - amplicon_id = 1 - for i in irdgroups[0]: - if i.intersects(rdList0[-1]) or len(hg.interval_list([i]).intersection(rdList)) == 0: - continue - logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Plotting viral view for interval " + str(i)) - bamFileb2b.plot_segmentation( - hg.interval_list([i, rdList0[-1]]), - outName + "_amplicon" + str(amplicon_id), - scale_list=hg.interval_list([i]), - font="large", - ) - amplicon_id += 1 - - -logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + "Total Runtime") diff --git a/bin/Coverage.py b/bin/Coverage.py old mode 100644 new mode 100755 diff --git a/bin/GroupedAnalysis.py b/bin/GroupedAnalysis.py deleted file mode 100755 index 3e1c1945..00000000 --- a/bin/GroupedAnalysis.py +++ /dev/null @@ -1,369 +0,0 @@ -#!/usr/bin/env python3 - -# Author: Jens Luebeck -# Contact: jluebeck [at] ucsd.edu -# License: BSD 2-Clause License -# Source: https://github.com/AmpliconSuite/AmpliconSuite-pipeline -# Commit: 0a8a2ff2324b15aab7cb88d310dcc458d06c0bedk - -import argparse -from datetime import datetime -import json -import os -import random -from subprocess import * -import sys -import time -import threading - -PAA_PATH = os.path.dirname(os.path.realpath(__file__)) + "/PrepareAA.py" - - -def generate_individual_seeds(cmd_dict, aa_py, odir, cnv_bed_dict): - individual_seed_dct = {} - print("Generating individual seeds") - for sname, argstring in cmd_dict.items(): - with open(sname + "_CNV_out.txt", "w") as outfile: - cmd = "{} {}{}".format(aa_py, PAA_PATH, argstring) - print(sname) - print(cmd + "\n") - call(cmd, stdout=outfile, stderr=outfile, shell=True) - - # if it was a seeds file, PAA won't modify, so move it into the right location - if sname in cnv_bed_dict and cnv_bed_dict[sname].endswith("AA_CNV_SEEDS.bed"): - cmd = "cp {} {}/".format(cnv_bed_dict[sname], odir) - call(cmd, shell=True) - - # store the name of the path of the seeds file - individual_seed_dct[sname] = "{}/{}_AA_CNV_SEEDS.bed".format(odir, sname) - - return individual_seed_dct - - -def group_seeds(individual_seed_dct, odir): - samplist = list(individual_seed_dct.keys()) - outname = odir + "_".join(samplist[:2]) - if len(samplist) > 2: - outname += "_etc_n" + str(len(samplist)) - - outname += "_merged_AA_CNV_SEEDS.bed" - - bedlist = " ".join(individual_seed_dct.values()) - print("Merging seeds") - cmd = "sort -k1,1 -k2,2n {} | bedtools merge -i - > {}".format(bedlist, outname) - print(cmd) - call(cmd, shell=True) - return outname - - -def launch_AA_AC(jobq, aa_py, PAA_PATH): - try: - sname, arg_string = jobq.pop() - - except IndexError: - return - - with open(sname + "_AA_AC_out.txt", "w") as outfile: - time.sleep(random.uniform(0, 0.75)) - cmd = "{} {}{}".format(aa_py, PAA_PATH, arg_string) - print("\nLaunching AA+AC job for " + sname + "\n" + cmd) - call(cmd, stdout=outfile, stderr=outfile, shell=True) - - -def create_AA_AC_cmds(tumor_lines, base_argstring, grouped_seeds): - cmd_dict = dict() - for tf in tumor_lines: - curr_argstring = "{} --run_AA --run_AC -s {} --bam {} --bed {}".format( - base_argstring, tf[0], tf[1], grouped_seeds - ) - - optionals = zip( - [ - "--sample_metadata", - ], - tf[4:], - ) - for k, v in optionals: - if v: - curr_argstring += " {} {}".format(k, v) - - cmd_dict[tf[0]] = curr_argstring - - return cmd_dict - - -# convert the parsed group input 
data to PrepareAA commands -def create_CNV_cmds(tumor_lines, normal_lines, base_argstring, cnvkit_dir): - if not normal_lines: - normalbam = None - - else: - normalbam = normal_lines[0] - if len(normal_lines) > 1: - print("More than one normal sample specified. Only the first will be used: " + normalbam[0]) - - cmd_dict = dict() - cnv_bed_dict = dict() - for tf in tumor_lines: - curr_argstring = "{} -s {} --bam {}".format(base_argstring, tf[0], tf[1]) - if normalbam: - curr_argstring += " --normal_bam {}".format(normalbam[1]) - - optionals = zip(["--cnv_bed", "--sample_metadata"], tf[3:]) - for k, v in optionals: - if v: - curr_argstring += " {} {}".format(k, v) - if k == "--cnv_bed": - cnv_bed_dict[tf[0]] = v - - if "--cnv_bed" not in curr_argstring and cnvkit_dir: - curr_argstring += " --cnvkit_dir " + cnvkit_dir - - # if QC is desired it will be done during stage 3 - if "--no_QC" not in curr_argstring: - curr_argstring += " --no_QC" - - cmd_dict[tf[0]] = curr_argstring - - return cmd_dict, cnv_bed_dict - - -def make_base_argstring(arg_dict, stop_at_seeds=False): - base_argstring = "" - for k, v in arg_dict.items(): - if v is True: - if k != "no_AA": - arg = " --" + k - base_argstring += arg - - elif v is not False and not k == "input" and not k == "cnvkit_dir": - arg = " --{} {}".format(k, str(v)) - base_argstring += arg - - return base_argstring - - -# read a file providing the group data -def read_group_data(input_file): - """ - group data is formatted as follows: - sample_name bam_file sample_type - where 'sample_type' is either 'tumor' or 'normal' - additional optional fields are as follows: - cnv_calls sample_metadata_json - """ - tumor_lines = [] - normal_lines = [] - with open(input_file) as infile: - for line in infile: - if line.startswith("#"): - continue - - fields = line.rstrip().rsplit() - if not fields: - continue - - for ind, v in enumerate(fields): - if v.upper() == "NA" or v.upper() == "NONE": - fields[ind] = None - - if fields[2].lower() == "tumor": - tumor_lines.append(fields) - - elif fields[2].lower() == "normal": - normal_lines.append(fields) - - else: - sys.stderr.write( - "Input formatting error! Column 3 must either be 'tumor' or 'normal'.\nSee README for " - "group input formatting instructions.\n\n" - ) - sys.exit(1) - - return tumor_lines, normal_lines - - -def get_argdict(args): - arg_dict = dict() - for arg in vars(args): - value = getattr(args, arg) - if value is not None and value != "": - arg_dict[arg] = value - - return arg_dict - - -# MAIN # -if __name__ == "__main__": - # Parses the command line arguments - parser = argparse.ArgumentParser( - description="A pipeline wrapper for AmpliconArchitect, invoking alignment CNV calling and CNV filtering prior. " - "Can launch AA, as well as downstream amplicon classification." - ) - parser.add_argument( - "-i", - "--input", - help="Input file providing the multi-sample information. See README for " - "information on how to format the input file.", - required=True, - ) - parser.add_argument( - "-o", "--output_directory", help="output directory names (will create if not already created)", required=True - ) - # parser.add_argument("-s", "--sample_name", help="sample name", required=True) - parser.add_argument( - "-t", - "--nthreads", - help="Number of threads to use in BWA, CNV calling and concurrent " "instances of PAA", - type=int, - required=True, - ) - parser.add_argument( - "--no_AA", help="Only produce the union of seeds for the group. 
Do not run AA/AC", action="store_true" - ) - # parser.add_argument("--run_AA", help="Run AA after all files prepared. Default off.", action='store_true') - # parser.add_argument("--run_AC", help="Run AmpliconClassifier after all files prepared. Default off.", - # action='store_true') - parser.add_argument( - "--ref", - help="Reference genome version.", - choices=["hg19", "GRCh37", "GRCh38", "hg38", "mm10", "GRCm38", "GRCh38_viral"], - ) - parser.add_argument("--cngain", type=float, help="CN gain threshold to consider for AA seeding", default=4.5) - parser.add_argument( - "--cnsize_min", type=int, help="CN interval size (in bp) to consider for AA seeding", default=50000 - ) - parser.add_argument("--downsample", type=float, help="AA downsample argument (see AA documentation)", default=10) - parser.add_argument( - "--use_old_samtools", - help="Indicate you are using an old build of samtools (prior to version " "1.0)", - action="store_true", - default=False, - ) - parser.add_argument( - "--rscript_path", - help="Specify custom path to Rscript, if needed when using CNVKit " "(which requires R version >3.4)", - ) - parser.add_argument("--python3_path", help="If needed, specify a custom path to python3.") - parser.add_argument( - "--aa_python_interpreter", - help="By default PrepareAA will use the system's default python path. If you would like to use " - "a different python version with AA, set this to either the path to the interpreter or " - "'python3' or 'python2'", - type=str, - default="python", - ) - # parser.add_argument("--freebayes_dir", - # help="Path to directory where freebayes executable exists (not the path to the executable " - # "itself). Only needed if using Canvas and freebayes is not installed on system path.") - # parser.add_argument("--vcf", help="VCF (in Canvas format, i.e., \"PASS\" in filter field, AD field as 4th entry of " - # "FORMAT field). When supplied with \"--sorted_bam\", pipeline will start from Canvas CNV stage." - # ) - parser.add_argument("--AA_src", help="Specify a custom $AA_SRC path. Overrides the bash variable") - parser.add_argument( - "--AA_runmode", - help="If --run_AA selected, set the --runmode argument to AA. Default mode is " "'FULL'", - choices=["FULL", "BPGRAPH", "CYCLES", "SVVIEW"], - default="FULL", - ) - parser.add_argument( - "--AA_extendmode", - help="If --run_AA selected, set the --extendmode argument to AA. Default " "mode is 'EXPLORE'", - choices=["EXPLORE", "CLUSTERED", "UNCLUSTERED", "VIRAL"], - default="EXPLORE", - ) - parser.add_argument( - "--AA_insert_sdevs", - help="Number of standard deviations around the insert size. May need to " - "increase for sequencing runs with high variance after insert size " - "selection step. (default 3.0)", - type=float, - default=3.0, - ) - # parser.add_argument("--normal_bam", help="Path to matched normal bam for CNVKit (optional)") - # parser.add_argument("--ploidy", type=float, help="Ploidy estimate for CNVKit (optional). This is not used outside " - # "of CNVKit.", default=None) - # parser.add_argument("--purity", type=float, help="Tumor purity estimate for CNVKit (optional). 
This is not used " - # "outside of CNVKit.", default=None) - parser.add_argument( - "--cnvkit_segmentation", - help="Segmentation method for CNVKit (if used), defaults to CNVKit " "default segmentation method (cbs).", - choices=["cbs", "haar", "hmm", "hmm-tumor", "hmm-germline", "none"], - default="cbs", - ) - parser.add_argument( - "--no_filter", help="Do not run amplified_intervals.py to identify amplified seeds", action="store_true" - ) - parser.add_argument("--no_QC", help="Skip QC on the BAM file.", action="store_true") - parser.add_argument("--skip_AA_on_normal_bam", help="Skip running AA on the normal bam", action="store_true") - # parser.add_argument("--sample_metadata", help="Path to a JSON of sample metadata to build on") - - # group = parser.add_mutually_exclusive_group(required=True) - # group.add_argument("--sorted_bam", "--bam", help="Coordinate sorted BAM file (aligned to an AA-supported " - # "reference.)") - # group.add_argument("--fastqs", help="Fastq files (r1.fq r2.fq)", nargs=2) - # group.add_argument("--completed_AA_runs", - # help="Path to a directory containing one or more completed AA runs which utilized the same reference genome.") - - # group2 = parser.add_mutually_exclusive_group() - # group2.add_argument("--cnv_bed", "--bed", - # help="BED file (or CNVKit .cns file) of CNV changes. Fields in the bed file should" - # " be: chr start end name cngain") - parser.add_argument( - "--cnvkit_dir", - help="Path to cnvkit.py. Assumes CNVKit is on the system path if not set. " "Not needed if --bed is given.", - ) - # group2.add_argument("--completed_run_metadata", - # help="Run metadata JSON to retroactively assign to collection of samples", default="") - # group2.add_argument("--align_only", help="Only perform the alignment stage (do not run CNV calling and seeding", - # action='store_true') - - args = parser.parse_args() - - if args.output_directory and not args.output_directory.endswith("/"): - args.output_directory += "/" - - if not args.aa_python_interpreter: - args.aa_python_interpreter = "python" - - arg_dict = get_argdict(args) - tumor_lines, normal_lines = read_group_data(args.input) - print("Found {} tumor samples and {} normals\n".format(str(len(tumor_lines)), str(len(normal_lines)))) - - # Stage 1: iterate over and launch each that needs CN calling. 
collect CN seeds files - base_argstring = make_base_argstring(arg_dict, stop_at_seeds=True) - print("Setting base argstring for Stage 1 as:") - print(base_argstring + "\n") - cmd_dict, cnv_bed_dict = create_CNV_cmds(tumor_lines, normal_lines, base_argstring, args.cnvkit_dir) - individual_seed_dct = generate_individual_seeds( - cmd_dict, args.aa_python_interpreter, args.output_directory, cnv_bed_dict - ) - - # Stage 2: merge seeds (bedtools - gotta sort and merge), and get new args - grouped_seeds = group_seeds(individual_seed_dct, args.output_directory) - - # Stage 3: launch each AA job in parallel - if not args.no_AA: - if args.skip_AA_on_normal_bam: - normal_lines = [] - - all_lines = normal_lines + tumor_lines - cmd_dict = create_AA_AC_cmds(all_lines, base_argstring, grouped_seeds) - threadL = [] - paa_threads = min(args.nthreads, len(all_lines)) - print("\nQueueing " + str(len(all_lines)) + " PAA jobs") - jobq = [] - for i in range(len(all_lines)): - sname = all_lines[i][0] - cmd_string = cmd_dict[sname] - jobq.append((sname, cmd_string)) - - for i in range(paa_threads): - threadL.append(threading.Thread(target=launch_AA_AC, args=(jobq, args.aa_python_interpreter, PAA_PATH))) - # threadL.append(workerThread(i, launch_AA_AC, cmd_string, args.aa_python_interpreter, PAA_PATH, sname)) - threadL[i].start() - - for t in threadL: - t.join() - - print("All jobs completed") diff --git a/bin/PrepareAA.py b/bin/PrepareAA.py deleted file mode 100755 index 28a946b0..00000000 --- a/bin/PrepareAA.py +++ /dev/null @@ -1,1088 +0,0 @@ -#!/usr/bin/env python - -# Author: Jens Luebeck -# Contact: jluebeck [at] ucsd.edu -# License: BSD 2-Clause License -# Source: https://github.com/AmpliconSuite/AmpliconSuite-pipeline -# Commit: 0a8a2ff2324b15aab7cb88d310dcc458d06c0bed - -import argparse -from datetime import datetime -import json -import logging -import os -import socket -from subprocess import * -import sys -import time - -import check_reference -import cnv_prefilter - -__version__ = "0.1537.2" - -PY3_PATH = "python3" # updated by command-line arg if specified -metadata_dict = {} # stores the run metadata (bioinformatic metadata) -sample_info_dict = {} # stores the sample metadata - - -def run_bwa(ref_fasta, fastqs, outdir, sname, nthreads, samtools, usingDeprecatedSamtools=False): - outname = outdir + sname - logging.info("Output prefix: " + outname) - logging.info("Checking for ref index") - exts = [".sa", ".amb", ".ann", ".pac", ".bwt"] - indexPresent = True - for i in exts: - if not os.path.exists(ref_fasta + i): - indexPresent = False - logging.info( - "Could not find " + ref_fasta + i + ", building BWA index from scratch. 
This could take > 60 minutes" - ) - break - - if not indexPresent: - cmd = "bwa index " + ref_fasta - call(cmd, shell=True) - - print("\nPerforming alignment and sorting") - if usingDeprecatedSamtools: - cmd = "{{ bwa mem -K 10000000 -t {} {} {} | {} view -Shu - | {} sort -m 4G -@4 - {}.cs; }} 2>{}_aln_stage.stderr".format( - nthreads, ref_fasta, fastqs, samtools, samtools, outname, outname - ) - else: - cmd = "{{ bwa mem -K 10000000 -t {} {} {} | {} view -Shu - | {} sort -m 4G -@4 -o {}.cs.bam -; }} 2>{}_aln_stage.stderr".format( - nthreads, ref_fasta, fastqs, samtools, samtools, outname, outname - ) - - logging.info(cmd) - call(cmd, shell=True) - metadata_dict["bwa_cmd"] = cmd - logging.info("\nPerforming duplicate removal & indexing") - cmd_list = [samtools, "rmdup", "-s", "{}.cs.bam".format(outname), "{}.cs.rmdup.bam".format(outname)] - # cmd_list = [samtools, "markdup", "-s", "-@ {}".format(nthreads), "{}.cs.bam".format(outname), {}.cs.rmdup.bam".format(outname)] - - logging.info(" ".join(cmd_list)) - call(cmd_list) - logging.info("\nRunning samtools index") - cmd_list = [samtools, "index", "{}.cs.rmdup.bam".format(outname)] - logging.info(" ".join(cmd_list)) - call(cmd_list) - logging.info("Removing temp BAM") - cmd = "rm {}.cs.bam".format(outname) - call(cmd, shell=True) - return outname + ".cs.rmdup.bam", outname + "_aln_stage.stderr" - - -def run_freebayes(ref, bam_file, outdir, sname, nthreads, regions, fb_path=None): - # Freebayes cmd-line args - # -f is fasta - # -r is region to call - logging.info("Running freebayes...") - fb_exec = "freebayes" - if fb_path: - fb_exec = fb_path + "/" + fb_exec - while True: - try: - curr_region_tup = regions.pop() - except IndexError: - break - - curr_region_string = curr_region_tup[0] + ":" + curr_region_tup[1] - logging.info(curr_region_string + ". " + str(len(regions)) + " items remaining.") - vcf_file = outdir + sname + "_" + curr_region_tup[0] + "_" + curr_region_tup[2] + ".vcf" - replace_filter_field_func = ( - 'awk \'{ if (substr($1,1,1) != "#" ) { $7 = ($7 == "." ? 
"PASS" : $7 ) }} 1 \' OFS="\\t"' - ) - cmd = "{} --genotype-qualities --standard-filters --use-best-n-alleles 5 --limit-coverage 25000 \ - --strict-vcf -f {} -r {} {} | {} > {}".format( - fb_exec, ref, curr_region_string, bam_file, replace_filter_field_func, vcf_file - ) - logging.info(cmd) - call(cmd, shell=True) - # gzip the new VCF - call("gzip -f " + vcf_file, shell=True) - - -def run_cnvkit(ckpy_path, nthreads, outdir, bamfile, seg_meth="cbs", normal=None, ref_fasta=None, vcf=None): - # CNVkit cmd-line args - # -m wgs: wgs data - # -y: assume chrY present - # -n: create flat reference (cnv baseline) - # -p: number of threads - # -f: reference genome fasta - bamBase = os.path.splitext(os.path.basename(bamfile))[0] - cnvkit_version = Popen([PY3_PATH, ckpy_path, "version"], stdout=PIPE, stderr=PIPE).communicate()[0].rstrip() - try: - cnvkit_version = cnvkit_version.decode("utf-8") - except UnicodeError: - pass - - metadata_dict["cnvkit_version"] = cnvkit_version - - ckRef = AA_REPO + args.ref + "/" + args.ref + "_cnvkit_filtered_ref.cnn" - logging.info("\nRunning CNVKit batch") - if normal: - # create a version of the stripped reference - scripts_dir = os.path.dirname(os.path.abspath(__file__)) + "/scripts/" - strip_cmd = "python {}reduce_fasta.py -r {} -c {} -o {}".format( - scripts_dir, ref_fasta, ref_genome_size_file, outdir - ) - call(strip_cmd, shell=True) - base = os.path.basename(ref_fasta) # args.ref is the name, ref is the fasta - stripRefG = outdir + os.path.splitext(base)[0] + "_reduced" + "".join(os.path.splitext(base)[1:]) - logging.debug("Stripped reference: " + stripRefG) - - cmd = "{} {} batch {} -m wgs --fasta {} -p {} -d {} --normal {}".format( - PY3_PATH, ckpy_path, bamfile, stripRefG, nthreads, outdir, normal - ) - else: - cmd = "{} {} batch -m wgs -r {} -p {} -d {} {}".format(PY3_PATH, ckpy_path, ckRef, nthreads, outdir, bamfile) - - logging.info(cmd) - call(cmd, shell=True) - metadata_dict["cnvkit_cmd"] = cmd + " ; " - rscript_str = "" - if args.rscript_path: - rscript_str = "--rscript-path " + args.rscript_path - logging.info("Set Rscript flag: " + rscript_str) - - cnrFile = outdir + bamBase + ".cnr" - cnsFile = outdir + bamBase + ".cns" - logging.info("\nRunning CNVKit segment") - # TODO: possibly include support for adding VCF calls. - cmd = "{} {} segment {} {} -p {} -m {} -o {}".format( - PY3_PATH, ckpy_path, cnrFile, rscript_str, nthreads, seg_meth, cnsFile - ) - logging.info(cmd) - exit_code = call(cmd, shell=True) - if exit_code != 0: - logging.error("CNVKit encountered a non-zero exit status. 
Exiting...\n") - sys.exit(1) - - metadata_dict["cnvkit_cmd"] = metadata_dict["cnvkit_cmd"] + cmd - logging.info("\nCleaning up temporary files") - cmd = "rm -f {}/*tmp.bed {}/*.cnn {}/*target.bed {}/*.bintest.cns".format(outdir, outdir, outdir, outdir) - logging.info(cmd) - call(cmd, shell=True) - cmd = "gzip -f " + cnrFile - logging.info(cmd) - call(cmd, shell=True) - if normal: - cmd = "rm " + stripRefG + " " + stripRefG + ".fai" - logging.info(cmd) - call(cmd, shell=True) - - -def merge_and_filter_vcfs(chr_names, vcf_list, outdir, sname): - logging.info("\nMerging VCFs and zipping") - # collect the vcf files to merge - merged_vcf_file = outdir + sname + "_merged.vcf" - relevant_vcfs = [x for x in vcf_list if any([i in x for i in chr_names])] - chrom_vcf_d = {} - for f in relevant_vcfs: - curr_chrom = f.rsplit(".vcf.gz")[0].rsplit("_")[-2:] - chrom_vcf_d[curr_chrom[0] + curr_chrom[1]] = f - - # chr_nums = [x.lstrip("chr") for x in chr_names] - pre_chr_str_names = [str(x) for x in range(1, 23)] + ["X", "Y"] - - # sort the elements - # include the header from the first one - if args.ref != "GRCh37" and args.ref != "GRCm38": - sorted_chr_names = ["chr" + str(x) for x in pre_chr_str_names] - cmd = "zcat " + chrom_vcf_d["chrM"] + """ | awk '$4 != "N"' > """ + merged_vcf_file - - else: - sorted_chr_names = [str(x) for x in pre_chr_str_names] - cmd = "zcat " + chrom_vcf_d["MT"] + """ | awk '$4 != "N"' > """ + merged_vcf_file - - logging.info(cmd) - call(cmd, shell=True) - - # zcat the rest, grepping out all header lines starting with "#" - logging.debug(sorted_chr_names) - for i in sorted_chr_names: - if i == "chrM" or i == "MT": - continue - - cmd_p = "zcat " + chrom_vcf_d[i + "p"] + """ | grep -v "^#" | awk '$4 != "N"' >> """ + merged_vcf_file - cmd_q = "zcat " + chrom_vcf_d[i + "q"] + """ | grep -v "^#" | awk '$4 != "N"' >> """ + merged_vcf_file - logging.info(cmd_p) - call(cmd_p, shell=True) - logging.info(cmd_q) - call(cmd_q, shell=True) - - cmd = "gzip -f " + merged_vcf_file - logging.info(cmd) - call(cmd, shell=True) - - return merged_vcf_file + ".gz" - - -# Read the CNVkit .cns files -def convert_cnvkit_cns_to_bed(cnvkit_output_directory, base, cnsfile=None, rescaled=False, nofilter=False): - if cnsfile is None: - if not rescaled: - cnsfile = cnvkit_output_directory + base + ".cns" - else: - cnsfile = cnvkit_output_directory + base + "_rescaled.cns" - - with open(cnsfile) as infile, open(cnvkit_output_directory + base + "_CNV_CALLS.bed", "w") as outfile: - head = next(infile).rstrip().rsplit("\t") - for line in infile: - fields = line.rstrip().rsplit("\t") - # s, e = int(fields[1]), int(fields[2]) - cn_r = float(fields[4]) - cn = 2 ** (cn_r + 1) - # do not filter on size since amplified_intervals.py will merge small ones. - outline = "\t".join(fields[0:3] + ["CNVkit", str(cn)]) + "\n" - outfile.write(outline) - - return cnvkit_output_directory + base + "_CNV_CALLS.bed" - - -def rescale_cnvkit_calls(ckpy_path, cnvkit_output_directory, base, cnsfile=None, ploidy=None, purity=None): - if purity is None and ploidy is None: - logging.warning("Warning: Rescaling called without --ploidy or --purity. Rescaling will have no effect.") - if cnsfile is None: - cnsfile = cnvkit_output_directory + base + ".cns" - - if purity < 0.4: - logging.warning("WARNING! 
Rescaling a low purity sample may cause many false-positive seed regions!") - - cmd = "{} {} call {} -m clonal".format(PY3_PATH, ckpy_path, cnsfile) - if purity: - cmd += " --purity " + str(purity) - if ploidy: - cmd += " --ploidy " + str(ploidy) - - cmd += " -o " + cnvkit_output_directory + base + "_rescaled.cns" - logging.info("Rescaling CNVKit calls\n" + cmd) - call(cmd, shell=True) - - -def run_amplified_intervals( - AA_interpreter, CNV_seeds_filename, sorted_bam, output_directory, sname, cngain, cnsize_min -): - logging.info("\nRunning amplified_intervals") - AA_seeds_filename = "{}_AA_CNV_SEEDS".format(output_directory + sname) - cmd = "{} {}/amplified_intervals.py --ref {} --bed {} --bam {} --gain {} --cnsize_min {} --out {}".format( - AA_interpreter, - AA_SRC, - args.ref, - CNV_seeds_filename, - sorted_bam, - str(cngain), - str(cnsize_min), - AA_seeds_filename, - ) - - logging.info(cmd) - exit_code = call(cmd, shell=True) - if exit_code != 0: - logging.error("amplified_intervals.py returned a non-zero exit code. Exiting...\n") - sys.exit(1) - - metadata_dict["amplified_intervals_cmd"] = cmd - return AA_seeds_filename + ".bed" - - -def run_AA( - AA_interpreter, - amplified_interval_bed, - sorted_bam, - AA_outdir, - sname, - downsample, - ref, - runmode, - extendmode, - insert_sdevs, -): - AA_version = ( - Popen([AA_interpreter, AA_SRC + "/AmpliconArchitect.py", "--version"], stdout=PIPE, stderr=PIPE) - .communicate()[1] - .rstrip() - ) - try: - AA_version = AA_version.decode("utf-8") - except UnicodeError: - pass - - metadata_dict["AA_version"] = AA_version - - cmd = "{} {}/AmpliconArchitect.py --ref {} --downsample {} --bed {} --bam {} --runmode {} --extendmode {} --out {}/{}".format( - AA_interpreter, - AA_SRC, - ref, - str(downsample), - amplified_interval_bed, - sorted_bam, - runmode, - extendmode, - AA_outdir, - sname, - ) - if insert_sdevs is not None: - cmd += " --insert_sdevs {}".format(str(insert_sdevs)) - - logging.info(cmd) - aa_exit_code = call(cmd, shell=True) - if aa_exit_code != 0: - logging.error("AmpliconArchitect returned a non-zero exit code. Exiting...\n") - sys.exit(1) - - metadata_dict["AA_cmd"] = cmd - - -def run_AC(AA_outdir, sname, ref, AC_outdir, AC_src): - logging.info("\nRunning AC") - # make input file - class_output = AC_outdir + sname - input_file = class_output + ".input" - bed_dir = class_output + "_classification_bed_files/" - if os.path.exists(bed_dir): - logging.warning( - "WARNING! AC files were not cleared prior to re-running. New classifications may become " - "mixed with previous classification files!" 
- ) - - cmd = "{}/make_input.sh {} {}".format(AC_src, AA_outdir, class_output) - logging.info(cmd) - call(cmd, shell=True) - - # run AC on input file - with open(input_file) as ifile: - sample_info_dict["number_of_AA_amplicons"] = len(ifile.readlines()) - - cmd = "{} {}/amplicon_classifier.py -i {} --ref {} -o {} --report_complexity".format( - PY3_PATH, AC_src, input_file, ref, class_output - ) - logging.info(cmd) - call(cmd, shell=True) - metadata_dict["AC_cmd"] = cmd - - # Get AC version - AC_version = ( - Popen([PY3_PATH, AC_src + "/amplicon_classifier.py", "--version"], stdout=PIPE, stderr=PIPE) - .communicate()[0] - .rstrip() - ) - try: - AC_version = AC_version.decode("utf-8") - except UnicodeError: - pass - - metadata_dict["AC_version"] = AC_version - - # iterate over the bed files and count anything that isn't "unknown" as a feature - feat_count = 0 - if os.path.exists(bed_dir): - for bf in os.listdir(bed_dir): - if not "unknown" in bf and bf.endswith(".bed"): - feat_count += 1 - - sample_info_dict["number_of_AA_features"] = feat_count - - -def make_AC_table(sname, AC_outdir, AC_src, run_metadata_file, sample_metadata_file, cnv_bed=None): - # make the AC output table - class_output = AC_outdir + sname - input_file = class_output + ".input" - summary_map_file = class_output + "_summary_map.txt" - classification_file = class_output + "_amplicon_classification_profiles.tsv" - cmd = "{} {}/make_results_table.py -i {} --classification_file {} --summary_map {}".format( - PY3_PATH, AC_src, input_file, classification_file, summary_map_file - ) - - if cnv_bed: - cmd += " --cnv_bed " + cnv_bed - - if run_metadata_file: - cmd += " --run_metadata_file " + run_metadata_file - - if sample_metadata_file: - cmd += " --sample_metadata_file " + sample_metadata_file - - logging.info(cmd) - call(cmd, shell=True) - - -def get_ref_sizes(ref_genome_size_file): - chr_sizes = {} - with open(ref_genome_size_file) as infile: - for line in infile: - fields = line.rstrip().rsplit() - if fields: - chr_sizes[fields[0]] = str(int(fields[1]) - 1) - - return chr_sizes - - -def get_ref_centromeres(ref_name): - centromere_dict = {} - fnameD = { - "GRCh38": "GRCh38_centromere.bed", - "GRCh37": "human_g1k_v37_centromere.bed", - "hg19": "hg19_centromere.bed", - "mm10": "mm10_centromere.bed", - "GRCm38": "GRCm38_centromere.bed", - "GRCh38_viral": "GRCh38_centromere.bed", - } - with open(AA_REPO + ref_name + "/" + fnameD[ref_name]) as infile: - for line in infile: - if not "centromere" in line and not "acen" in line: - continue - fields = line.rstrip().rsplit("\t") - if fields[0] not in centromere_dict: - centromere_dict[fields[0]] = (fields[1], fields[2]) - - else: - pmin = min(int(centromere_dict[fields[0]][0]), int(fields[1])) - pmax = max(int(centromere_dict[fields[0]][1]), int(fields[2])) - # pad with 20kb to avoid freebayes issues in calling near centromeres - centromere_dict[fields[0]] = (str(pmin - 20000), str(pmax + 20000)) - - return centromere_dict - - -def save_run_metadata(outdir, sname, args, launchtime, commandstring): - # make a dictionary that stores - # datetime - # hostname - # ref - # PAA command - # AA python interpreter version - # bwa cmd - # CN cmd - # AA cmd - # PAA version - # CNVKit version - # AA version - # AC version - metadata_dict["launch_datetime"] = launchtime - metadata_dict["hostname"] = socket.gethostname() - metadata_dict["ref_genome"] = args.ref - aapint = args.aa_python_interpreter if args.aa_python_interpreter else "python" - aa_python_v = Popen([aapint, "--version"], stdout=PIPE, 
stderr=PIPE).communicate()[1].rstrip() - try: - aa_python_v = aa_python_v.decode("utf-8") - except UnicodeError: - pass - - metadata_dict["AA_python_version"] = aa_python_v - - metadata_dict["PAA_command"] = commandstring - metadata_dict["PAA_version"] = __version__ - - for x in [ - "bwa_cmd", - "cnvkit_cmd", - "amplified_intervals_cmd", - "AA_cmd", - "AC_cmd", - "cnvkit_version", - "AA_version", - "AC_version", - ]: - if x not in metadata_dict: - metadata_dict[x] = "NA" - - # save the json dict - run_metadata_filename = outdir + sname + "_run_metadata.json" - with open(run_metadata_filename, "w") as fp: - json.dump(metadata_dict, fp, indent=2) - - # sample_info_dict["run_metadata_file"] = run_metadata_filename - return run_metadata_filename - - -def detect_run_failure(align_stderr_file, AA_outdir, sname, AC_outdir): - if align_stderr_file: - cmd = "grep -i error " + align_stderr_file - try: - aln_errs = check_output(cmd, shell=True).decode("utf-8") - - except CalledProcessError: - aln_errs = "" - - if aln_errs: - logging.error("Detected error during bwa mem alignment stage\n") - return True - - if AA_outdir: - sumfile = AA_outdir + sname + "_summary.txt" - if os.path.isfile(sumfile): - namps = -1 - with open(sumfile) as infile: - for line in infile: - if line.startswith("#Amplicons = "): - namps = int(line.rstrip().rsplit(" = ")[-1]) - break - - if namps < 0: - logging.error("Detected truncated or missing AA outputs") - return True - - for x in range(1, namps + 1): - try: - fsize = os.stat(AA_outdir + sname + "_amplicon" + str(x) + "_cycles.txt").st_size - - except OSError: - fsize = 0 - - if fsize == 0: - logging.error("Detected truncated or missing AA outputs") - return True - - else: - logging.error("Detected error during AA stage") - return True - - if AC_outdir: - try: - fsize1 = os.stat(AC_outdir + sname + "_amplicon_classification_profiles.tsv").st_size - fsize2 = os.stat(AC_outdir + sname + "_result_table.tsv").st_size - - except OSError: - fsize1 = 0 - fsize2 = 0 - - if fsize1 == 0 or fsize2 == 0: - logging.error("Detected error during AC stage\n") - return True - - return False - - -# MAIN # -if __name__ == "__main__": - # Parses the command line arguments - parser = argparse.ArgumentParser( - description="A pipeline wrapper for AmpliconArchitect, invoking alignment CNV calling and CNV filtering prior. " - "Can launch AA, as well as downstream amplicon classification." - ) - parser.add_argument("-o", "--output_directory", help="output directory names (will create if not already created)") - parser.add_argument("-s", "--sample_name", help="sample name", required=True) - parser.add_argument("-t", "--nthreads", help="Number of threads to use in BWA and CNV calling", required=True) - parser.add_argument("--run_AA", help="Run AA after all files prepared. Default off.", action="store_true") - parser.add_argument( - "--run_AC", help="Run AmpliconClassifier after all files prepared. 
Default off.", action="store_true" - ) - parser.add_argument( - "--ref", - help="Reference genome version.", - choices=["hg19", "GRCh37", "GRCh38", "hg38", "mm10", "GRCm38", "GRCh38_viral"], - ) - parser.add_argument("--cngain", type=float, help="CN gain threshold to consider for AA seeding", default=4.5) - parser.add_argument( - "--cnsize_min", type=int, help="CN interval size (in bp) to consider for AA seeding", default=50000 - ) - parser.add_argument("--downsample", type=float, help="AA downsample argument (see AA documentation)", default=10) - parser.add_argument( - "--use_old_samtools", - help="Indicate you are using an old build of samtools (prior to version " "1.0)", - action="store_true", - default=False, - ) - parser.add_argument( - "--rscript_path", - help="Specify custom path to Rscript, if needed when using CNVKit " "(which requires R version >3.4)", - ) - parser.add_argument("--python3_path", help="If needed, specify a custom path to python3.") - parser.add_argument( - "--aa_python_interpreter", - help="By default PrepareAA will use the system's default python path. If you would like to use " - "a different python version with AA, set this to either the path to the interpreter or " - "'python3' or 'python2'", - type=str, - default="python", - ) - # parser.add_argument("--freebayes_dir", - # help="Path to directory where freebayes executable exists (not the path to the executable " - # "itself). Only needed if using Canvas and freebayes is not installed on system path.") - # parser.add_argument("--vcf", help="VCF (in Canvas format, i.e., \"PASS\" in filter field, AD field as 4th entry of " - # "FORMAT field). When supplied with \"--sorted_bam\", pipeline will start from Canvas CNV stage." - # ) - parser.add_argument("--AA_src", help="Specify a custom $AA_SRC path. Overrides the bash variable") - parser.add_argument( - "--AA_runmode", - help="If --run_AA selected, set the --runmode argument to AA. Default mode is " "'FULL'", - choices=["FULL", "BPGRAPH", "CYCLES", "SVVIEW"], - default="FULL", - ) - parser.add_argument( - "--AA_extendmode", - help="If --run_AA selected, set the --extendmode argument to AA. Default " "mode is 'EXPLORE'", - choices=["EXPLORE", "CLUSTERED", "UNCLUSTERED", "VIRAL"], - default="EXPLORE", - ) - parser.add_argument( - "--AA_insert_sdevs", - help="Number of standard deviations around the insert size. May need to " - "increase for sequencing runs with high variance after insert size selection step. (default " - "3.0)", - type=float, - default=None, - ) - parser.add_argument("--normal_bam", help="Path to matched normal bam for CNVKit (optional)") - parser.add_argument( - "--ploidy", - type=float, - help="Ploidy estimate for CNVKit (optional). This is not used outside of CNVKit.", - default=None, - ) - parser.add_argument( - "--purity", - type=float, - help="Tumor purity estimate for CNVKit (optional). This is not used outside of CNVKit.", - default=None, - ) - parser.add_argument( - "--cnvkit_segmentation", - help="Segmentation method for CNVKit (if used), defaults to CNVKit " "default segmentation method (cbs).", - choices=["cbs", "haar", "hmm", "hmm-tumor", "hmm-germline", "none"], - default="cbs", - ) - parser.add_argument( - "--no_filter", help="Do not run amplified_intervals.py to identify amplified seeds", action="store_true" - ) - parser.add_argument( - "--no_QC", - help="Skip QC on the BAM file. 
Do not adjust AA insert_sdevs for " "poor-quality insert size distribution", - action="store_true", - ) - parser.add_argument("--sample_metadata", help="Path to a JSON of sample metadata to build on") - parser.add_argument( - "-v", "--version", action="version", version="PrepareAA version {version} \n".format(version=__version__) - ) - parser.add_argument( - "--samtools_path", - help="Path to samtools binary (e.g., /path/to/my/samtools). If unset, will use samtools on system path.", - default="", - ) - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument( - "--sorted_bam", "--bam", help="Coordinate sorted BAM file (aligned to an AA-supported " "reference.)" - ) - group.add_argument("--fastqs", help="Fastq files (r1.fq r2.fq)", nargs=2) - group.add_argument( - "--completed_AA_runs", - help="Path to a directory containing one or more completed AA runs which utilized the same reference genome.", - ) - group2 = parser.add_mutually_exclusive_group() - group2.add_argument( - "--cnv_bed", - "--bed", - help="BED file (or CNVKit .cns file) of CNV changes. Fields in the bed file should" - " be: chr start end name cngain", - ) - group2.add_argument( - "--cnvkit_dir", help="Path to cnvkit.py. Assumes CNVKit is on the system path if not set", default="" - ) - group2.add_argument( - "--completed_run_metadata", - help="Run metadata JSON to retroactively assign to collection of samples", - default="", - ) - group2.add_argument( - "--align_only", help="Only perform the alignment stage (do not run CNV calling and seeding", action="store_true" - ) - - # start timing - ta = time.time() - ti = ta - launchtime = str(datetime.now()) - args = parser.parse_args() - - # set an output directory if user did not specify - if not args.output_directory: - args.output_directory = os.getcwd() - - if not args.output_directory.endswith("/"): - args.output_directory += "/" - - sname = args.sample_name - outdir = args.output_directory - sample_metadata_filename = args.output_directory + sname + "_sample_metadata.json" - - # set samtools for use, 20230428 - if not args.samtools_path.endswith("/samtools"): - if args.samtools_path and not args.samtools_path.endswith("/"): - args.samtools_path += "/" - args.samtools_path += "samtools" - - # Make and clear necessary directories. - # make the output directory location if it does not exist - if not os.path.exists(args.output_directory): - os.mkdir(args.output_directory) - - # initiate logging - paa_logfile = args.output_directory + sname + ".log" - logging.basicConfig(filename=paa_logfile, format="[%(name)s:%(levelname)s]\t%(message)s", level=logging.INFO) - logging.getLogger().addHandler(logging.StreamHandler()) - logging.info("Launched on " + launchtime) - logging.info("AmpiconSuite-pipeline version " + __version__ + "\n") - - commandstring = "" - for arg in sys.argv: - if " " in arg: - commandstring += '"{}" '.format(arg) - else: - commandstring += "{} ".format(arg) - - logging.info(commandstring + "\n") - - if "/" in args.sample_name: - logging.error("Sample name -s cannot be a path. Specify output directory with -o.\n") - sys.exit(1) - - finish_flag_filename = args.output_directory + args.sample_name + "_finish_flag.txt" - if os.path.exists(finish_flag_filename): - logging.warning( - "WARNING: Running PrepareAA.py with outputs directed into the same exact output prefix may " - "cause crashes or other unexpected behavior. 
To avoid errors, clear previous files before " - "re-running.\n" - ) - - with open(finish_flag_filename, "w") as ffof: - ffof.write("UNSUCCESSFUL\n") - - timing_logfile = open(args.output_directory + args.sample_name + "_timing_log.txt", "w") - timing_logfile.write("#stage:\twalltime(seconds)\n") - - # Check if expected system paths and files are present. Check if provided argument combinations are valid. - if args.AA_src: - os.environ["AA_SRC"] = args.AA_src - - # Check if AA_REPO set, print error and quit if not - try: - AA_REPO = os.environ["AA_DATA_REPO"] + "/" - - except KeyError: - logging.error("AA_DATA_REPO bash variable not found. AmpliconArchitect may not be properly installed.\n") - sys.exit(1) - - if not os.path.exists(os.path.join(AA_REPO, "coverage.stats")): - logging.info("coverage.stats file not found in " + AA_REPO + "\nCreating a new coverage.stats file.") - cmd = "touch {}coverage.stats && chmod a+rw {}coverage.stats".format(AA_REPO, AA_REPO) - logging.info(cmd) - call(cmd, shell=True) - - try: - AA_SRC = os.environ["AA_SRC"] - - except KeyError: - logging.error("AA_SRC bash variable not found. AmpliconArchitect may not be properly installed.\n") - sys.exit(1) - - if (args.fastqs or args.completed_AA_runs) and not args.ref: - logging.error("Must specify --ref when providing unaligned fastq files.\n") - sys.exit(1) - - if args.completed_run_metadata.lower() == "none": - args.completed_run_metadata = None - - # if not these args are set, assume cnvkit.py is on the path. - if not (args.cnv_bed or args.cnvkit_dir or args.completed_run_metadata or args.align_only) and ( - args.fastqs or args.sorted_bam - ): - try: - args.cnvkit_dir = str(check_output(["which cnvkit.py"], shell=True).decode("utf-8").rstrip()) - - except CalledProcessError: - logging.error("cnvkit.py not found on system path. Must specify --cnvkit_dir") - sys.exit(1) - - elif args.cnvkit_dir and not args.cnvkit_dir.endswith("/") and not args.cnvkit_dir.endswith("cnvkit.py"): - args.cnvkit_dir += "/" - - else: - args.completed_run_metadata = None - - if not args.cnvkit_dir.endswith("cnvkit.py"): - args.cnvkit_dir += "cnvkit.py" - - if args.run_AA: - if not os.path.exists(os.environ["HOME"] + "/mosek/mosek.lic") and not "MOSEKLM_LICENSE_FILE" in os.environ: - logging.error("--run_AA set, but MOSEK license not found!") - sys.exit(1) - - elif "MOSEKLM_LICENSE_FILE" in os.environ and not os.path.exists( - os.environ["MOSEKLM_LICENSE_FILE"] + "/mosek.lic" - ): - logging.error("--run_AA set, but MOSEK license not found!") - sys.exit(1) - - runCNV = None - if args.cnvkit_dir and not args.cnv_bed: - runCNV = "CNVkit" - # check Rscript version - test_rscript = "Rscript" - if args.rscript_path: - if not args.rscript_path.endswith("/Rscript"): - args.rscript_path += "/Rscript" - - test_rscript = args.rscript_path - - try: - rscript_version_out = str(check_output([test_rscript, "--version"], stderr=STDOUT).decode("utf-8").rstrip()) - - except CalledProcessError: - logging.error(test_rscript + " not found. 
Must specify --rscript_path") - sys.exit(1) - - if args.python3_path: - if not args.python3_path.endswith("/python") and not args.python3_path.endswith("/python3"): - args.python3_path += "/python3" - - PY3_PATH = args.python3_path - - refFnames = {x: None for x in ["hg19", "GRCh37", "GRCh38", "GRCh38_viral", "mm10"]} - # Paths of all the repo files needed - if args.ref == "hg38": - args.ref = "GRCh38" - if args.ref == "GRCm38": - args.ref = "mm10" - - for rname in refFnames.keys(): - if os.path.exists(AA_REPO + "/" + rname): - refFnames[rname] = check_reference.get_ref_fname(AA_REPO, rname) - - faidict = {} - if args.sorted_bam: - if args.ref and refFnames[args.ref]: - faidict[args.ref] = AA_REPO + args.ref + "/" + refFnames[args.ref] + ".fai" - - elif args.ref and refFnames[args.ref] is None: - em = ( - "Data repo files for ref " + args.ref + " not found. Please download from " - "https://datasets.genepattern.org/?prefix=data/module_support_files/AmpliconArchitect/\n" - ) - logging.error(em) - sys.stderr.write(em) - sys.exit(1) - - else: - for k, v in refFnames.items(): - if v: - faidict[k] = AA_REPO + k + "/" + v + ".fai" - - determined_ref = check_reference.check_ref(args.sorted_bam, faidict, args.samtools_path) - if not determined_ref and not args.ref: - logging.error("Please make sure AA data repo is populated.") - sys.exit(1) - - elif not args.ref: - args.ref = determined_ref - - elif args.ref and not determined_ref: - logging.warning("WARNING! The BAM file did not match " + args.ref) - - gdir = AA_REPO + args.ref + "/" - ref_fasta = gdir + refFnames[args.ref] - ref_genome_size_file = gdir + args.ref + "_noAlt.fa.fai" - removed_regions_bed = gdir + args.ref + "_merged_centromeres_conserved_sorted.bed" - # ploidy_vcf = gdir + "dummy_ploidy.vcf" - if not os.path.isfile(removed_regions_bed): - logging.debug(str(os.listdir(gdir)) + "\n") - logging.error("PrepareAA data repo files not found in AA data repo. 
Please update your data repo.\n") - sys.exit(1) - - elif args.cnv_bed and not os.path.isfile(args.cnv_bed): - logging.error("Specified CNV bed file does not exist: " + args.cnv_bed + "\n") - sys.exit(1) - - if not args.sample_metadata: - args.sample_metadata = os.path.dirname(os.path.realpath(__file__)) + "/sample_metadata_skeleton.json" - - with open(args.sample_metadata) as input_json: - sample_info_dict = json.load(input_json) - - sample_info_dict["reference_genome"] = args.ref - sample_info_dict["sample_name"] = sname - - tb = time.time() - timing_logfile.write("Initialization:\t" + "{:.2f}".format(tb - ta) + "\n") - ta = tb - logging.info("Running PrepareAA on sample: " + sname) - # Begin PrepareAA pipeline - aln_stage_stderr = None - if args.fastqs: - # Run BWA - fastqs = " ".join(args.fastqs) - logging.info("Will perform alignment on " + fastqs) - args.sorted_bam, aln_stage_stderr = run_bwa( - ref_fasta, fastqs, outdir, sname, args.nthreads, args.samtools_path, args.use_old_samtools - ) - - if not args.completed_AA_runs: - bamBaiNoExt = args.sorted_bam[:-3] + "bai" - cramCraiNoExt = args.sorted_bam[:-4] + "crai" - baiExists = os.path.isfile(args.sorted_bam + ".bai") or os.path.isfile(bamBaiNoExt) - craiExists = os.path.isfile(args.sorted_bam + ".crai") or os.path.isfile(cramCraiNoExt) - if not baiExists and not craiExists: - logging.info(args.sorted_bam + " index not found, calling samtools index") - call([args.samtools_path, "index", args.sorted_bam]) - logging.info("Finished indexing") - - bambase = os.path.splitext(os.path.basename(args.sorted_bam))[0] - prop_paired_proportion = None - if not args.no_QC: - logging.debug("samtools path is set to: " + args.samtools_path) - prop_paired_proportion = check_reference.check_properly_paired(args.sorted_bam, args.samtools_path) - - tb = time.time() - timing_logfile.write("Alignment, indexing and QC:\t" + "{:.2f}".format(tb - ta) + "\n") - - if args.align_only: - logging.info("Completed\n") - tf = time.time() - timing_logfile.write("Total_elapsed_walltime\t" + "{:.2f}".format(tf - ti) + "\n") - timing_logfile.close() - sys.exit() - - ta = tb - centromere_dict = get_ref_centromeres(args.ref) - chr_sizes = get_ref_sizes(ref_genome_size_file) - # coordinate CNV calling - if runCNV == "CNVkit": - cnvkit_output_directory = args.output_directory + sname + "_cnvkit_output/" - if not os.path.exists(cnvkit_output_directory): - os.mkdir(cnvkit_output_directory) - - run_cnvkit( - args.cnvkit_dir, - args.nthreads, - cnvkit_output_directory, - args.sorted_bam, - seg_meth=args.cnvkit_segmentation, - normal=args.normal_bam, - ref_fasta=ref_fasta, - ) - if args.ploidy or args.purity: - rescale_cnvkit_calls( - args.cnvkit_dir, cnvkit_output_directory, bambase, ploidy=args.ploidy, purity=args.purity - ) - rescaling = True - else: - rescaling = False - - args.cnv_bed = convert_cnvkit_cns_to_bed(cnvkit_output_directory, bambase, rescaled=rescaling) - - if args.cnv_bed.endswith(".cns"): - args.cnv_bed = convert_cnvkit_cns_to_bed(outdir, bambase, cnsfile=args.cnv_bed, nofilter=True) - - tb = time.time() - timing_logfile.write("CNV calling:\t" + "{:.2f}".format(tb - ta) + "\n") - ta = tb - - sample_info_dict["sample_cnv_bed"] = args.cnv_bed - - if not args.no_filter and not args.cnv_bed.endswith("_AA_CNV_SEEDS.bed"): - if not args.cnv_bed.endswith("_CNV_CALLS_pre_filtered.bed"): - args.cnv_bed = cnv_prefilter.prefilter_bed( - args.cnv_bed, args.ref, centromere_dict, chr_sizes, args.cngain, args.output_directory - ) - - amplified_interval_bed = 
run_amplified_intervals( - args.aa_python_interpreter, args.cnv_bed, args.sorted_bam, outdir, sname, args.cngain, args.cnsize_min - ) - - else: - logging.info("Skipping filtering of bed file.") - amplified_interval_bed = args.cnv_bed - - tb = time.time() - timing_logfile.write("Seed filtering (amplified_intervals.py):\t" + "{:.2f}".format(tb - ta) + "\n") - ta = tb - - # Run AA - if args.run_AA: - AA_outdir = outdir + sname + "_AA_results/" - if not os.path.exists(AA_outdir): - os.mkdir(AA_outdir) - - # set the insert sdevs if not given by user. - if ( - not args.no_QC - and not args.AA_insert_sdevs - and prop_paired_proportion is not None - and prop_paired_proportion < 90 - ): - logging.info("Properly paired rate less than 90%, setting --insert_sdevs 9.0 for AA") - args.AA_insert_sdevs = 9.0 - - run_AA( - args.aa_python_interpreter, - amplified_interval_bed, - args.sorted_bam, - AA_outdir, - sname, - args.downsample, - args.ref, - args.AA_runmode, - args.AA_extendmode, - args.AA_insert_sdevs, - ) - tb = time.time() - timing_logfile.write("AmpliconArchitect:\t" + "{:.2f}".format(tb - ta) + "\n") - ta = tb - # Run AC - if args.run_AC: - AC_SRC = os.environ["AC_SRC"] - AC_outdir = outdir + sname + "_classification/" - if not os.path.exists(AC_outdir): - os.mkdir(AC_outdir) - - run_AC(AA_outdir, sname, args.ref, AC_outdir, AC_SRC) - - tb = time.time() - timing_logfile.write("AmpliconClassifier:\t" + "{:.2f}".format(tb - ta) + "\n") - - run_metadata_filename = save_run_metadata(outdir, sname, args, launchtime, commandstring) - - with open(sample_metadata_filename, "w") as fp: - json.dump(sample_info_dict, fp, indent=2) - - if args.run_AA and args.run_AC: - make_AC_table( - sname, - AC_outdir, - AC_SRC, - run_metadata_filename, - sample_metadata_filename, - sample_info_dict["sample_cnv_bed"], - ) - - else: - ta = time.time() - AC_SRC = os.environ["AC_SRC"] - AC_outdir = outdir + sname + "_classification/" - if not os.path.exists(AC_outdir): - os.mkdir(AC_outdir) - - run_AC(args.completed_AA_runs, sname, args.ref, AC_outdir, AC_SRC) - - tb = time.time() - timing_logfile.write("AmpliconClassifier:\t" + "{:.2f}".format(tb - ta) + "\n") - - with open(sample_metadata_filename, "w") as fp: - json.dump(sample_info_dict, fp, indent=2) - - make_AC_table(sname, AC_outdir, AC_SRC, args.completed_run_metadata, sample_metadata_filename) - - if not args.run_AA: - AA_outdir = None - - if not args.run_AC: - AC_outdir = None - - if not detect_run_failure(aln_stage_stderr, AA_outdir, sname, AC_outdir): - logging.info("\nAll stages appear to have completed successfully.") - with open(args.output_directory + args.sample_name + "_finish_flag.txt", "w") as ffof: - ffof.write("All stages completed\n") - - tf = time.time() - timing_logfile.write("Total_elapsed_walltime\t" + "{:.2f}".format(tf - ti) + "\n") - timing_logfile.close() diff --git a/bin/abstract_graph.py b/bin/abstract_graph.py deleted file mode 100755 index 916f82ef..00000000 --- a/bin/abstract_graph.py +++ /dev/null @@ -1,160 +0,0 @@ -#!/usr/bin/env python - -# This software is Copyright 2017 The Regents of the University of California. All Rights Reserved. Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. 
Permission to make commercial use of this software may be obtained by contacting: -# -# Office of Innovation and Commercialization -# -# University of California -# -# La Jolla, CA 92093-0910 -# -# (858) 534-5815 -# -# invent@ucsd.edu -# -# This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. -# -# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. - - -# Author: Viraj Deshpande -# Contact: virajbdeshpande@gmail.com - - -# This file defines classes and methods for an abstract undirected graph, vertex and edge. - - -import logging - - -class abstract_vertex(object): - """Class describing a graph vertex. - Attributes: - elist: List of abstract_edges - vid: (optional) ID for the abstract_vertex - graph: (optional) abstract_graph to which the vertex belongs""" - - def __init__(self, vid=0, graph=None): - """Initiate vertex with optional vid and graph""" - self.elist = [] - self.vid = vid # vertexid - self.graph = graph - if self.vid == 0 and self.graph is not None: - self.vid = self.graph.next_vid() - if self.graph is not None: - if vid in graph.vs: - raise Exception("Adding with duplicate vid") - self.graph.include_vertex(self) - - def neighbors(self): - """Return list of vertices connected to abstract_vertex by a direct edge""" - return [e.v2 for e in self.elist] - - def __hash__(self): - """Return hash based on vid to allow to efficiently check for presence of vid in graph, etc""" - return self.vid - - def __repr__(self): - """Vertex is represented by vid""" - return str(self.vid) - - -class abstract_edge(object): - """Class describing a graph edge. - Attributes: - v1, v2: Ordered pair of vertices connected by the edge - eid: (optional) ID for the abstract_edge - graph: (optional) abstract_graph to which the vertex belongs.""" - - def __init__(self, v1, v2, eid=0, graph=None, update_vertices=True): - """Initiate edge - Arguments: v1, v2, (optional)eid, (optional) graph. - update_vertices: (optional True/False) to update vertices to include edge in v1.elist, v2.elist. 
(default=True) - """ - self.v1, self.v2 = v1, v2 - self.eid = eid - self.graph = graph - if self.eid == 0 and self.graph is not None: - self.eid = self.graph.next_eid() - if self.graph is not None: - if eid in self.graph.es: - raise Exception("Adding edge with duplicate eid") - self.graph.include_edge(self) - if update_vertices: - if v1.graph is not v2.graph: - raise Exception("Adding edge between vertices of different graphs.") - if graph is not None and v1.graph is not graph: - raise Exception("Edge in different graph than vertex.") - if graph is None and v1.graph is not None: - graph = v1.graph - v1.elist.append(self) - v2.elist.append(self) - - def neighbor(self, v): - """Given a vertex, return its neighbor along the edge""" - if v == self.v1: - return self.v2 - if v == self.v2: - return self.v1 - raise Exception("Edge not connected to vertex") - - def __hash__(self): - """Return hash based on eid to allow to efficiently check for presence of eid in graph, etc""" - return self.eid - - def length(self): - """Not implemented""" - pass - - def __repr__(self): - """String representation of the form v1<->v2.""" - return str(self.v1) + "<->" + str(self.v2) - - -class abstract_graph(object): - """Class describing a graph. - Attributes: - vs: Dictionary from vid/key to vertex - es: Dictionary from eid/key to edge - max_vid: (internal) max_vid, used to assign vid for new vertex. Suggested to use function next_vid. - max_eid: (internal) max_eid, used to assign eid for new edge. Suggested to use function next_eid.""" - - def __init__(self): - """Initiate empty graph""" - self.es = {} # key -->edges - self.vs = {} # key -->vertices - # self.logger = logging.getLogger('Algae') - self.max_eid = 1 - self.max_vid = 1 - - def include_vertex(self, v): - """Include orphan abstract_vertex in graph and update vertex.graph to point to self""" - if v.vid in self.vs and self.vs[v.vid] is not v: - raise "Adding vertex with duplicate vid" - if v.graph is not None and v.graph is not self: - raise "Adding vertex from another graph" - if v.graph is None: - v.graph = self - self.vs[v.vid] = v - - def include_edge(self, e): - """Include orphan abstract_edge in graph and update edge.graph to point to self. Vertices should be updated separately""" - if e.eid in self.es and self.es[e.eid] is not e: - raise "Adding edge with duplicate eid" - if e.graph is not None and e.graph is not self: - raise "Adding edge from another graph" - if e.graph is None: - e.graph = self - self.es[e.eid] = e - - def next_eid(self): - """Find the next eid available for assignment to new edge""" - while self.max_eid in self.es or -1 * self.max_eid in self.es: - self.max_eid += 1 - return self.max_eid - - def next_vid(self): - """Find the next vid available for assignment to new vertex""" - while self.max_vid in self.vs or -1 * self.max_vid in self.vs: - self.max_vid += 1 - return self.max_vid diff --git a/bin/amplified_intervals.py b/bin/amplified_intervals.py deleted file mode 100755 index 3f9da206..00000000 --- a/bin/amplified_intervals.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python - -# This software is Copyright 2017 The Regents of the University of California. All Rights Reserved. Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. 
Permission to make commercial use of this software may be obtained by contacting: -# -# Office of Innovation and Commercialization -# -# University of California -# -# La Jolla, CA 92093-0910 -# -# (858) 534-5815 -# -# invent@ucsd.edu -# -# This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. -# -# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. - -# Author: Viraj Deshpande -# Contact: virajbdeshpande@gmail.com -# Maintained by Jens Luebeck jluebeck@ucsd.edu -# Source: https://github.com/jluebeck/AmpliconArchitect -# Commit: 2172cdfd5b2834f98f60a5ee77f282249e16f527 - -import argparse -import logging -import os -import sys - -import numpy as np -import pysam - -import global_names - -sys.setrecursionlimit(10000) - -GAIN = 4.5 -CNSIZE_MIN = 50000 - - -parser = argparse.ArgumentParser(description="Filter and merge amplified intervals") -parser.add_argument( - "--bed", - dest="bed", - help="Input bed file with list of amplified intervals", - metavar="FILE", - action="store", - type=str, - required=True, -) -parser.add_argument( - "--out", - dest="out", - help="OPTIONAL: Prefix filename for output bed file. Default: _amplified.bed", - metavar="FILE", - action="store", - type=str, - default="", -) -parser.add_argument( - "--bam", - dest="bam", - help="OPTIONAL: Bamfile, used to avoid large aneuploidies", - metavar="FILE", - action="store", - type=str, - default="", -) -parser.add_argument( - "--gain", - dest="gain", - help="OPTIONAL: CN gain threshold for interval to be considered as a seed. Default: 5", - action="store", - type=float, - default=GAIN, -) -parser.add_argument( - "--cnsize_min", - dest="cnsize_min", - help="OPTIONAL: Minimum size (in bp) for interval to be considered as a seed. Default: 100000", - action="store", - type=int, - default=CNSIZE_MIN, -) -parser.add_argument( - "--ref", - dest="ref", - help='Values: [hg19, GRCh37, GRCh38, GRCh38_viral, mm10, GRCm38]. "hg19", "GRCh38", "mm10" : chr1, .. chrM etc / "GRCh37", "GRCm38" : \'1\', \'2\', .. \'MT\' etc/ "None" : Do not use any annotations. 
AA can tolerate additional chromosomes not stated but accuracy and annotations may be affected.', - metavar="STR", - action="store", - type=str, - choices=["hg19", "GRCh37", "GRCh38", "GRCh38_viral", "mm10", "GRCm38"], - required=True, -) -parser.add_argument( - "--no_cstats", - dest="no_cstats", - help="Do not re-use coverage statistics from coverage.stats.", - action="store_true", - default=False, -) - -args = parser.parse_args() - -global_names.REF = args.ref -import ref_util as hg - -if args.bed != "": - rdAlts = args.bed - -if args.out != "": - outname = args.out + ".bed" -else: - outname = os.path.splitext(rdAlts)[0] + "_amplified.bed" - -GAIN, CNSIZE_MIN = args.gain, args.cnsize_min - -rdList0 = hg.interval_list(rdAlts, "bed") -if rdList0: - try: - if len(rdList0[0].info) == 0: - logging.error( - "ERROR: CNV estimate bed file had too few columns.\n" "Must contain: chr pos1 pos2 cnv_estimate\n" - ) - sys.exit(1) - - _ = float(rdList0[0].info[-1]) - - except ValueError: - logging.error("ERROR: CNV estimates must be in last column of bed file.\n") - sys.exit(1) - -tempL = [] -for r in rdList0: - if args.ref == "GRCh38_viral" and not r.chrom.endswith("chr"): - tempL.append(r) - - elif float(r.info[-1]) > GAIN: - tempL.append(r) - -rdList = hg.interval_list(tempL) - -# rdList = hg.interval_list([r for r in rdList0 if float(r.info[-1]) > GAIN or (args.ref == "GRCh38_viral" and not r.chrom.endswith("chr"))]) - -if args.bam != "": - import bam_to_breakpoint as b2b - - if os.path.splitext(args.bam)[-1] == ".cram": - bamFile = pysam.Samfile(args.bam, "rc") - else: - bamFile = pysam.Samfile(args.bam, "rb") - - cstats = None - cb = bamFile - if os.path.exists(os.path.join(hg.DATA_REPO, "coverage.stats")) and not args.no_cstats: - coverage_stats_file = open(os.path.join(hg.DATA_REPO, "coverage.stats")) - for l in coverage_stats_file: - ll = l.strip().split() - if not ll: - continue - bamfile_pathname = str(cb.filename.decode()) - if ll[0] == os.path.abspath(bamfile_pathname): - bamfile_filesize = os.path.getsize(bamfile_pathname) - - cstats = tuple(map(float, ll[1:])) - if len(cstats) < 15 or cstats[13] != 3 or bamfile_filesize != int(cstats[14]) or any(np.isnan(cstats)): - cstats = None - - coverage_stats_file.close() - - bamFileb2b = b2b.bam_to_breakpoint(bamFile, coverage_stats=cstats) - pre_int_list = [] - for r in rdList: - try: - chrom_cov_ratio = bamFileb2b.median_coverage(refi=r)[0] / bamFileb2b.median_coverage()[0] - # print("chrom ratio " + r.chrom + " " + str(chrom_cov_ratio)) - if ( - float(r.info[-1]) - > GAIN + 2 * max(1.0, bamFileb2b.median_coverage(refi=r)[0] / bamFileb2b.median_coverage()[0]) - 2 - and bamFileb2b.median_coverage(refi=r)[0] / bamFileb2b.median_coverage()[0] > 0 - ): - if r.size() < 10000000 or float(r.info[-1]) > 1.5 * GAIN: - pre_int_list.append(r) - - elif float(r.info[-1]) > 1 and args.ref == "GRCh38_viral" and not r.chrom.startswith("chr"): - pre_int_list.append(r) - - except ZeroDivisionError: - logging.error("zero division error", r.chrom, args.ref, float(r.info[-1])) - - # if float(r.info[-1]) > 1 and args.ref == "GRCh38_viral" and not r.chrom.startswith("chr"): - # pre_int_list.append(r) - # - continue - - rdList = hg.interval_list(pre_int_list) - -amplicon_listl = rdList - -cr = hg.conserved_regions -uc_list = hg.interval_list([]) -for a in amplicon_listl: - if ( - len(hg.interval_list([a]).intersection(cr)) == 0 - or a.size() - > max(1000000, 10 * sum([a.intersection(ci[1]).size() for ci in hg.interval_list([a]).intersection(cr)])) - or a.size() - 
sum([a.intersection(ci[1]).size() for ci in hg.interval_list([a]).intersection(cr)]) > 2000000 - ): - if (len(hg.interval_list([a]).intersection(cr))) == 0: - uc_list.append(a) - else: - cra = hg.interval_list([a]).intersection(cr) - cpos = a.start - for crai in cra: - if cpos < crai[1].start - 1000000: - uc_list.append(hg.interval(a.chrom, cpos, crai[1].start - 1000000, info=a.info)) - cpos = crai[1].end + 1000000 - if a.end > cpos: - uc_list.append(hg.interval(a.chrom, cpos, a.end, info=a.info)) - -new_uc_list = [] -for a in uc_list: - if args.ref == "GRCh38_viral" and not a.chrom.startswith("chr"): - if a.rep_content() < 2.5: - new_uc_list.append(a) - else: - if float(a.info[-1]) * a.segdup_uniqueness() > GAIN and a.rep_content() < 2.5: - new_uc_list.append(a) - -uc_merge = hg.interval_list(new_uc_list).merge_clusters(extend=300000) - -with open(outname, "w") as outfile: - for a in uc_merge: - is_viral = False - if args.ref == "GRCh38_viral" and not a[0].chrom.startswith("chr"): - is_viral = True - - if sum([ai.size() for ai in a[1]]) > CNSIZE_MIN or is_viral: - outfile.write( - "\t".join( - [ - str(a[0]), - str(sum([ai.size() * float(ai.info[-1]) for ai in a[1]]) / sum([ai.size() for ai in a[1]])), - rdAlts, - ] - ) - + "\n" - ) diff --git a/bin/bam2bam.py b/bin/bam2bam.py old mode 100644 new mode 100755 diff --git a/bin/bam_to_breakpoint.py b/bin/bam_to_breakpoint.py deleted file mode 100755 index 0f72fe0d..00000000 --- a/bin/bam_to_breakpoint.py +++ /dev/null @@ -1,3682 +0,0 @@ -# This software is Copyright 2017 The Regents of the University of California. All Rights Reserved. Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. Permission to make commercial use of this software may be obtained by contacting: -# -# Office of Innovation and Commercialization -# -# University of California -# -# La Jolla, CA 92093-0910 -# -# (858) 534-5815 -# -# invent@ucsd.edu -# -# This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. -# -# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
- -# Author: Viraj Deshpande -# Contact: virajbdeshpande@gmail.com -# Source: https://github.com/jluebeck/AmpliconArchitect -# Commit: 2172cdfd5b2834f98f60a5ee77f282249e16f527 - -import itertools -from time import time -import pysam -import math -import copy -from collections import defaultdict -import mosek_solver -import sys -import numpy as np -from scipy import stats -import heapq -import os -import logging -import bisect -import matplotlib - -matplotlib.use("Agg") -import matplotlib.pyplot as plt -from matplotlib.patches import Ellipse, Rectangle, Arc -import matplotlib.ticker as ticker -from matplotlib import gridspec -import random -import re -from past.builtins import xrange -from functools import reduce - -from breakpoint_graph import * -import ref_util as hg -from mycolors import * -import global_names - - -# use Arial font if you have it. will fall back to default if not available. -matplotlib.rcParams["font.family"] = "sans-serif" -matplotlib.rcParams["font.sans-serif"] = ["Arial"] -matplotlib.rcParams["pdf.fonttype"] = 42 - -summary_logger = logging.getLogger("summary") -graph_logger = logging.getLogger("graph") -cycle_logger = logging.getLogger("cycle") - -# suppress some specific harmless numpy warnings during AA -np.seterr( - divide="ignore", - invalid="ignore", -) -TSTART = global_names.TSTART - - -class breakpoint_cluster: - def __init__(self, edge, bamfile, max_insert): - self.edge = edge - - -class bam_to_breakpoint: - def __init__( - self, - bamfile, - sample_name="", - read_length=100, - max_insert=400, - insert_size=300, - num_sdevs=3, - window_size=10000, - min_coverage=30, - pair_support=-1, - pair_support_min=2, - downsample=-1, - secondary_index=None, - coverage_stats=None, - coverage_windows=None, - sensitivems=False, - span_coverage=True, - tstart=0, - ): - self.bamfile = bamfile - self.sample_name = sample_name - self.window_size = window_size - self.min_coverage = min_coverage - self.max_insert = max_insert - self.insert_size = insert_size - self.num_sdevs = num_sdevs - self.read_length = read_length - self.secondary_index = secondary_index - self.gc_scale = defaultdict(lambda: 1.0) # self.gc_scaling() - self.gc_set = False - self.ms_window_size = 10000 - self.downsample = downsample - self.downsample_ratio = 1 - self.sensitivems = sensitivems - self.span_coverage = span_coverage - self.mapping_quality_cutoff = 5 - self.breakpoint_mapping_quality_cutoff = 20 - self.breakpoint_entropy_cutoff = 0.75 - self.pair_support_min = pair_support_min - hg.update_chrLen([(c["SN"], c["LN"]) for c in self.bamfile.header["SQ"]]) - self.discordant_edge_calls = {} - self.interval_coverage_calls = {} - self.tstart = tstart if tstart != 0 else TSTART - if coverage_stats is None: - self.basic_stats_set = False - self.median_coverage(window_list=coverage_windows) - else: - ( - wc_10000_median, - wc_10000_avg, - wc_10000_std, - wc_300_median, - wc_300_avg, - wc_300_std, - self.read_length, - self.insert_size, - self.insert_std, - self.min_insert, - self.max_insert, - self.pair_support, - self.percent_proper, - _, - _, - ) = coverage_stats - self.basic_stats = coverage_stats - self.basic_stats_set = True - r = coverage_stats - - if self.downsample < 0 or self.downsample > self.basic_stats[0]: - self.downsample_ratio = 1 - elif self.downsample == 0: - self.downsample_ratio = 10.0 / self.basic_stats[0] if self.basic_stats[0] > 10 else 1 - else: - self.downsample_ratio = ( - float(self.downsample) / self.basic_stats[0] if self.basic_stats[0] > float(self.downsample) else 1 - ) - - if 
self.downsample_ratio != 1: - rr = self.downsample_ratio - rsq = math.sqrt(rr) - r = [i[0] * i[1] for i in zip([rr, rr, rsq, rr, rr, rsq, 1, 1, 1, 1, 1, 1, 1], r)] - r[11] = max((r[4] / 10.0) * ((r[7] - r[6]) / 2 / r[6]) * r[12], 2) - self.pair_support = r[11] - self.downsample_stats = r - else: - self.downsample_stats = self.basic_stats - self.coverage_logs = {} - - if pair_support != -1: - self.pair_support = pair_support - - # Methods to find coverage and other statistics of bam file - - def fetch(self, c, s, e): - if s > e: - (s, e) = (e, s) - if s < 0: - s = 1 - if s > hg.chrLen[hg.chrNum(c)]: - s = hg.chrLen[hg.chrNum(c)] - 1 - e = hg.chrLen[hg.chrNum(c)] - 1 - if e < 0: - s = 1 - e = 1 - if e > hg.chrLen[hg.chrNum(c)]: - e = hg.chrLen[hg.chrNum(c)] - 1 - if self.downsample_ratio == 1: - for a in self.bamfile.fetch(c, s, e + 1): - yield a - else: - for a in self.bamfile.fetch(c, s, e + 1): - random.seed(a.query_name) - if random.uniform(0, 1) < self.downsample_ratio: - yield a - - def interval_coverage(self, i, clip=False, gcc=False): - call_args = (i.chrom, i.start, i.end, clip, gcc) - if call_args in self.interval_coverage_calls: - return self.interval_coverage_calls[call_args] - if gcc: - wc_raw = self.window_coverage(i) - wc_corrected = 0 - j = 0 - for w in wc_raw: - alist = [a for a in self.fetch(w[0].chrom, w[0].start, w[0].end)] - wc_corrected += w[0].size() * w[1] / self.gc_scaling()[int(w[0].gc_content() * 10) / 10.0] - if ( - w[0].size() * w[1] / self.gc_scaling()[int(w[0].gc_content() * 10) / 10.0] - > 10 * len(alist) * self.read_length - ): - print( - str(i).strip(), - str(w[0]).strip(), - wc_corrected, - len(alist), - w[1], - self.gc_scaling()[int(w[0].gc_content() * 10) / 10.0], - w[0].gc_content(), - w[0].sequence(), - ) - j += 1 - if j > 100: - raise ValueError("j>100") - self.interval_coverage_calls[call_args] = wc_corrected / i.size() - return self.interval_coverage_calls[call_args] - s2 = i.start - e2 = i.end - if i.start < 0: - s2 = 0 - if i.end > hg.chrLen[hg.chrNum(i.chrom)]: - e2 = hg.chrLen[hg.chrNum(i.chrom)] - if s2 >= e2: - return 0 - - # if e2 - s2 >= window_size and clip == False and gcc == False: - # return len(alist) * self.read_length / float(e2 - s2) - # sumb = 0 - # if clip == True or (clip == False and i.size() >= 100 * self.read_length): - # return len(alist) * self.read_length / float(i.size()) - if clip == True or (clip is None and e2 - s2 <= 1000): - icc = ( - sum( - [ - sum(a) - for a in self.bamfile.count_coverage( - i.chrom, s2, e2, quality_threshold=self.mapping_quality_cutoff - ) - ] - ) - * self.downsample_ratio - / max(1.0, float(e2 - s2 + 1)) - ) - self.interval_coverage_calls[call_args] = icc - return self.interval_coverage_calls[call_args] - else: - alist_len = len( - [ - a - for a in self.fetch(i.chrom, s2, e2) - if not a.is_unmapped - and a.reference_end - 1 <= e2 - and a.mapping_quality > self.mapping_quality_cutoff - ] - ) - self.interval_coverage_calls[call_args] = alist_len * self.read_length / max(1.0, float(e2 - s2 + 1)) - return self.interval_coverage_calls[call_args] - - # Maintainer found this code block is unreachable - # for a in alist: - # ai = hg.interval(a, bamfile=self.bamfile).intersection(i) - # if ai is not None: - # sumb += ai.size() - # if sumb / float(i.size()) > 10 * len(alist) * self.read_length / float(i.size()): - # print(str(i), sumb, len(alist)) - # raise ValueError("isize exception") - # self.interval_coverage_calls[call_args] = sumb / float(i.size()) - # return self.interval_coverage_calls[call_args] - - 
def window_coverage_stats(self, i, window_size=-1, gcc=False): - if window_size == -1: - window_size = self.max_insert - self.read_length - j = range(i.start, i.end, window_size) - jj = [hg.interval(i.chrom, k, k + window_size) for k in j] - cc = [self.interval_coverage(k, gcc=gcc) for k in jj] - dd = [abs(cc[j + 1] - cc[j]) for j in range(len(jj) - 1)] - return (sum(cc) / len(cc), sum(dd) / len(dd)) - - def window_coverage(self, i, window_size=-1, gcc=False, clip=None, exact=True): - # print str(i) - if window_size == -1: - window_size = self.max_insert - self.read_length - - def win_breakup(i, window_size): - if exact: - (istart, iend) = (i.start, i.end) - else: - istart = window_size * int(round(float(i.start) / window_size)) - iend = window_size * int(round(float(i.end) / window_size)) - for k in xrange(istart, iend, window_size): - yield hg.interval(i.chrom, k, k + window_size - 1) - - for k in win_breakup(i, window_size): - yield (k, self.interval_coverage(k, gcc=gcc, clip=clip)) - # return [(k, self.interval_coverage(k, gcc=gcc)) for k in jj] - - def median_coverage(self, window_size=-1, gcc=False, refi=-1, window_list=None): - if (window_size == 10000 or window_size == -1) and self.basic_stats_set and refi == -1: - return self.downsample_stats - if window_size == 300 and self.basic_stats_set and refi == -1: - return self.downsample_stats[3:6] - - num_iter = 1000 - iteri = 0 - chroffset = 0 - sumchrLen = sum([l for l in hg.chrLen.values()]) - if refi != -1: - if type(refi) == str: - sumchrLen = hg.chrLen[hg.chrNum(refi)] - chroffset = hg.absPos(refi, 1) - elif type(refi) == hg.interval: - if len([i for i in hg.centromere_list if i.chrom == refi.chrom]) == 0: - chr_cent = None - else: - chr_cent = [i for i in hg.centromere_list if i.chrom == refi.chrom][0] - if chr_cent is None: - sumchrLen = hg.chrLen[hg.chrNum(refi.chrom)] - chroffset = hg.absPos(refi.chrom, 1) - elif chr_cent.end > refi.end and chr_cent.start > refi.start: - sumchrLen = chr_cent.start - chroffset = hg.absPos(refi.chrom, 1) - elif chr_cent.start < refi.start and chr_cent.end < refi.end: - sumchrLen = hg.chrLen[hg.chrNum(refi.chrom)] - chr_cent.end - chroffset = hg.absPos(refi.chrom, 1) + chr_cent.end - else: - sumchrLen = hg.chrLen[hg.chrNum(refi.chrom)] - chroffset = hg.absPos(refi.chrom, 1) - # if hg.chrPos(chroffset) is None: - if refi != -1: - cp = hg.chrPos(chroffset) - if cp is not None: - ii = hg.interval(cp[0], cp[1], cp[1] + sumchrLen) - unconserved_len = sumchrLen - sum( - [i[0].intersection(i[1]).size() for i in hg.interval_list([ii]).intersection(hg.conserved_regions)] - ) - if (sumchrLen < 1000000 or (refi != -1 and unconserved_len < 1000000)) and window_size == -1: - return self.downsample_stats - - elif (sumchrLen < 1000000) and window_size == -1: - return self.downsample_stats - - if (refi != -1 or window_size != -1) and (chroffset, sumchrLen, window_size) in self.coverage_logs: - return self.coverage_logs[(chroffset, sumchrLen, window_size)] - # logging.info("Calculating median arm coverage " + str(refi) + " " + str(window_size)) - - if not self.basic_stats_set: - read_length = [] - insert_size = [] - window_list_index = 0 - non_mapping = 0 - random.seed(global_names.SEED) - while (window_list is not None and window_list_index < len(window_list)) or ( - window_list is None and iteri <= num_iter - ): - if window_list is None: - newpos = int(random.random() * sumchrLen) + chroffset - else: - cwindow = window_list[window_list_index] - window_list_index += 1 - if cwindow.end - cwindow.start < 10000: - 
continue - newpos = hg.absPos(cwindow.chrom, ((cwindow.end + cwindow.start) / 2) - 5000) - if hg.chrPos(newpos) is None: - logging.debug( - "Unable to locate reference position: " - + refi.chrom - + " " - + str(refi.start) - + " " - + str(refi.end) - + " " - + str(newpos) - + " " - + str(sumchrLen) - ) - iteri += 1 - continue - (c, p) = hg.chrPos(newpos) - if ( - c not in self.bamfile.references - or p < 10000 - or hg.chrLen[hg.chrNum(c)] < p + 10000 - or len( - hg.interval_list([hg.interval(c, p, p + 10000)]).intersection( - hg.conserved_regions, extend=10000 - ) - ) - > 0 - or len( - hg.interval_list([hg.interval(c, p, p + 10000)]).intersection(hg.centromere_list, extend=10000) - ) - > 0 - ): - continue - read_length += [ - a.infer_query_length(always=False) for a in self.fetch(c, p, p + 10000) if not a.is_unmapped - ] - insert_size += [ - a.template_length - for a in self.fetch(c, p, p + 10000) - if a.is_proper_pair and not a.is_reverse and a.template_length < 10000 and a.template_length > 0 - ] - iteri += 1 - self.read_length = np.average(read_length) - self.insert_size = np.average(insert_size) - percent_proper = len(insert_size) * 2.0 / (len(read_length) + non_mapping) - self.percent_proper = percent_proper - self.insert_std = np.std(insert_size) - self.max_insert = self.insert_size + self.num_sdevs * self.insert_std - self.min_insert = max(0, self.insert_size - self.num_sdevs * self.insert_std) - - if window_size not in [-1, 300, 10000]: - ws_list = [window_size] - else: - ws_list = [10000, 300] - - wc_median = [] - wc_avg = [] - wc_std = [] - random.seed(global_names.SEED) - for ws in ws_list: - wc_ws = [] - iteri = 0 - window_list_index = 0 - while (window_list is not None and window_list_index < len(window_list)) or ( - window_list is None and iteri <= num_iter - ): - if window_list is None: - newpos = int(random.random() * sumchrLen) + chroffset - else: - cwindow = window_list[window_list_index] - window_list_index += 1 - if cwindow.end - cwindow.start < 10000: - continue - newpos = hg.absPos(cwindow.chrom, ((cwindow.end + cwindow.start) / 2) - 5000) - if hg.chrPos(newpos) is None: - logging.warning( - "Unable to locate reference position: " - + refi.chrom - + " " - + str(refi.start) - + " " - + str(refi.end) - + " " - + str(newpos) - + " " - + str(sumchrLen) - ) - iteri += 1 - continue - (c, p) = hg.chrPos(newpos) - if ( - c not in self.bamfile.references - or p < ws - or hg.chrLen[hg.chrNum(c)] < p + ws - or len(hg.interval_list([hg.interval(c, p, p + ws)]).intersection(hg.conserved_regions, extend=ws)) - > 0 - or len(hg.interval_list([hg.interval(c, p, p + ws)]).intersection(hg.centromere_list, extend=ws)) - > 0 - ): - continue - wc_ws.append(self.interval_coverage(hg.interval(c, p, p + ws), gcc=gcc)) - iteri += 1 - wc_ws.sort() - wc_ws_median = np.median(wc_ws) - wc_ws_filter = [c for c in wc_ws if c < 5 * wc_ws_median and c > 0] - if len(wc_ws_filter) == 0: - print(len(wc_ws_filter), len(wc_ws), len([c for c in wc_ws if c > 0]), wc_ws_median) - wc_median.append(0) - wc_avg.append(0) - wc_std.append(0) - else: - wc_median.append(wc_ws_filter[len(wc_ws_filter) // 2]) - wc_avg.append(np.average(wc_ws_filter)) - wc_std.append(np.std(wc_ws_filter)) - - if window_size not in [-1, 300, 10000] or refi != -1: - self.coverage_logs[(chroffset, sumchrLen, window_size)] = (wc_median[0], wc_avg[0], wc_std[0]) - return (wc_median[0], wc_avg[0], wc_std[0]) - - (wc_10000_median, wc_10000_avg, wc_10000_std) = (wc_median[0], wc_avg[0], wc_std[0]) - (wc_300_median, wc_300_avg, wc_300_std) = 
(wc_median[1], wc_avg[1], wc_std[1]) - bamfile_pathname = str(self.bamfile.filename.decode()) - bamfile_filesize = os.path.getsize(bamfile_pathname) - self.pair_support = max( - int( - round( - (wc_300_avg / 10.0) - * ((self.insert_size - self.read_length) / 2 / self.read_length) - * self.percent_proper - ) - ), - self.pair_support_min, - ) - rstats = ( - wc_10000_median, - wc_10000_avg, - wc_10000_std, - wc_300_median, - wc_300_avg, - wc_300_std, - self.read_length, - self.insert_size, - self.insert_std, - self.min_insert, - self.max_insert, - self.pair_support, - self.percent_proper, - self.num_sdevs, - bamfile_filesize, - ) - if refi == -1: - self.basic_stats = rstats - self.basic_stats_set = True - print( - "read length:", - self.read_length, - "insert size:", - self.insert_size, - "insert std dev:", - self.insert_std, - "max_insert:", - self.max_insert, - "percent proper:", - percent_proper, - "num_sdevs", - self.num_sdevs, - ) - print("coverage stats", self.basic_stats, len(wc_ws_filter)) - print("pair support", self.pair_support) - coverage_stats_file = open(hg.DATA_REPO + "/coverage.stats", "a") - coverage_stats_file.write( - os.path.abspath(self.bamfile.filename.decode("utf-8")) + "\t" + "\t".join(map(str, rstats)) + "\n" - ) - coverage_stats_file.close() - - r = rstats - if self.downsample < 0 or self.downsample > self.basic_stats[0]: - self.downsample_ratio = 1 - elif self.downsample == 0: - self.downsample_ratio = 10.0 / self.basic_stats[0] if self.basic_stats[0] > 10 else 1 - else: - self.downsample_ratio = ( - float(self.downsample) / self.basic_stats[0] if self.basic_stats[0] > float(self.downsample) else 1 - ) - if self.downsample_ratio != 1: - rr = self.downsample_ratio - rsq = math.sqrt(rr) - r = [i[0] * i[1] for i in zip([rr, rr, rsq, rr, rr, rsq, 1, 1, 1, 1, 1, 1, 1, 1, 1], r)] - r[11] = max((r[4] / 10.0) * ((r[7] - r[6]) / 2 / r[6]) * r[12], 2) - self.pair_support = r[11] - self.downsample_stats = r - - else: - self.downsample_stats = self.basic_stats - - return rstats - - def gc_scaling(self): - if self.gc_set: - return self.gc_scale - gc_total_rd = {i / 10.0: 0 for i in range(11)} - gc_num_windows = {i / 10.0: 0 for i in range(11)} - for ri in range(len(self.bamfile.references)): - # print self.bamfile.references[ri] - # print hg.chrLen - if hg.chrNum(self.bamfile.references[ri]) not in hg.chrLen: - continue - # print self.bamfile.references[ri] - wc = self.window_coverage(hg.interval(self.bamfile.references[ri], 0, self.bamfile.lengths[ri])) - lwc = 0 - for w in wc: - lwc += 1 - gc_total_rd[int(w[0].gc_content() * 10.0) / 10.0] += w[1] - gc_num_windows[int(w[0].gc_content() * 10.0) / 10.0] += 1 - # print gc_num_windows, gc_total_rd, lwc - break - sc_factor = sum(gc_total_rd.values()) / sum(gc_num_windows.values()) - scale = {} - for i in gc_total_rd: - if gc_num_windows[i] == 0: - scale[i] = 1.0 - else: - scale[i] = gc_total_rd[i] / gc_num_windows[i] / sc_factor - self.gc_scale = scale - self.gc_set = True - logging.debug("GC scale:", scale) - return scale - - # Methods to find all coverage shifts in amplicon - def meanshift(self, i, window_size=-1, hb=2, cov=None, rd_global=-1, h0=-1, gcc=False, n=-1): - if window_size == -1: - window_size = self.max_insert - self.read_length - if rd_global == -1: - rd_global = self.median_coverage(window_size, gcc)[0] - if h0 == -1: - h0 = self.median_coverage(window_size, gcc)[2] - if rd_global == 0: - rd_global = self.median_coverage()[0] - h0 = self.median_coverage()[2] - if n == -1: - n = min(max(100, 10 * hb), 10000000 // 
window_size) - j = range(len(cov)) - if cov is None: - s2 = i.start - window_size * n - e2 = i.end + window_size * n - if s2 < 0: - s2 = 0 - if e2 > hg.chrLen[hg.chrNum(i.chrom)]: - e2 = hg.chrLen[hg.chrNum(i.chrom)] - j = range(s2, e2, window_size) - # j = range(i.start, i.end, window_size) - # jj = [hg.interval(i.chrom, k, k + window_size) for k in j] - i2 = hg.interval(i.chrom, s2, e2) - cov = [c for c in self.window_coverage(i2, window_size, gcc, exact=False)] - - # cov = [self.interval_coverage(k) for k in jj] - # print window_size, len(cov), str(cov[0][0]).strip(), cov[0][1], str(cov[1][0]).strip(), cov[1][1] - def hr(wi): - if cov[wi][1] < rd_global / 4.0: - return h0 / 2.0 - else: - # return math.sqrt(cov[wi][1] * self.read_length / window_size) - return math.sqrt(cov[wi][1] / rd_global) * h0 - - dfi = [ - ( - cov[wi][0], - sum( - [ - wj - * math.exp(-0.5 * wj**2 / hb**2) - * math.exp(-0.5 * (cov[wi + wj][1] - cov[wi][1]) ** 2 / hr(wi) ** 2) - for wj in range(-1 * n, n + 1) - ] - ), - ) - for wi in range(n, len(j) - n) - if cov[wi][0] is not None - ] - # print 'meanshift', str(i), len(cov), len(dfi), len([c for c in cov if c[0] is not None]), [(str(c[0][0]), c[0][1], c[1][1]) for c in zip([cc for cc in cov if cc[0] is not None], dfi) ] - # [(interval,ms)] - return dfi - - def meanshift_pval(self, s1, s2): - if len(s1) <= 1 and len(s2) <= 1: - return 1.0 - if len(s1) > 1 and len(s2) > 1: - return stats.ttest_ind(s1, s2, equal_var=False)[1] - elif len(s1) == 1: - zscore = abs(s1[0] - np.average(s1 + s2)) / np.std(s1 + s2) - return stats.norm.sf(zscore) - elif len(s2) == 1: - zscore = abs(s2[0] - np.average(s1 + s2)) / np.std(s1 + s2) - return stats.norm.sf(zscore) - return 1.0 - - def meanshift_segmentation(self, i, window_size=-1, gcc=False, pvalue=0.01): - logging.debug("Computing meanshift segmentation on " + str(i)) - if window_size == -1: - window_size = 10000 - i = hg.interval( - i.chrom, - window_size * int(round(float(i.start) / window_size)), - window_size * int(round(float(i.end) / window_size)), - ) - mc = self.median_coverage(window_size, gcc) - rd_global = mc[0] - h0 = mc[2] - hb_profile = [2, 5, 10, 50, 100] - # hb_profile = [2] - n = min(max(100, 10 * hb_profile[-1]), 10000000 // window_size) # number of windows used to calculate meanshift - logging.debug("MS: " + str(i) + " window_size, n: " + str((window_size, n))) - s2 = i.start - window_size * n - e2 = i.end + window_size * n - logging.debug("MS: " + str(i) + " s2, e2: " + str((s2, e2))) - startskip = 0 - endskip = 0 - if s2 < 0: - s2 = i.start % window_size - startskip = n - (i.start - s2) // window_size - if e2 > hg.chrLen[hg.chrNum(i.chrom)]: - hgl = hg.chrLen[hg.chrNum(i.chrom)] - e2 = hgl - (hgl - i.end) % window_size - endskip = n - (hgl - i.end) // window_size - - i2 = hg.interval(i.chrom, s2, e2) - logging.debug("MS: " + str(i) + " startskip,endskip" + str((startskip, endskip))) - cov = [c for c in self.window_coverage(i2, window_size, gcc, exact=False)] - cov = [(None, 0) for ni in range(startskip)] + cov + [(None, 0) for ni in range(endskip)] - frozen = [] - - def hr(c, wlen): - if c < rd_global / 4.0: - return h0 / 2.0 - else: - return math.sqrt(c / rd_global) * h0 - - for hb in hb_profile: - cov2 = copy.copy(cov) - for ms_iterate in range(1): - fi = -1 - if len(frozen) > 0: - fi = 0 - ms = [w for w in self.meanshift(i, window_size, hb, cov2, rd_global=rd_global, h0=h0, gcc=gcc, n=n)] - segs = [] - new_seg = [] - msi = 0 - # print 'THIS0', len(frozen), fi #, frozen[0][0][0].start - # for ff in 
range(len(frozen)): - # print "THIS", ff, frozen[ff][0][0][0].start - while msi < len(ms): - if fi >= 0 and fi < len(frozen) and ms[msi][0].start == frozen[fi][0][0][0].start: - if len(new_seg) > 0 and (frozen[fi][1] % 2 == 1 or (ms[msi][1] > 0 and ms[msi - 1][1] <= 0)): - segs.append(new_seg) - new_seg = ms[msi : msi + len(frozen[fi][0])] - else: - new_seg += ms[msi : msi + len(frozen[fi][0])] - # segs.append(ms[msi: msi + len(frozen[fi][0])]) - msi += len(frozen[fi][0]) - fi += 1 - continue - elif ms[msi][1] > 0 and ms[msi - 1][1] <= 0 and len(new_seg) > 0: - segs.append(new_seg) - new_seg = [] - new_seg.append(ms[msi]) - msi += 1 - if len(new_seg) > 0: - segs.append(new_seg) - cov2 = copy.copy(cov[:n]) - covi = n - for msi in range(len(segs)): - s = segs[msi] - c = np.average([cc[1] for cc in cov[covi : covi + len(s)]]) - cov2 += [(ss[0], c) for ss in segs[msi]] - covi += len(segs[msi]) - cov2 += cov[-n:] - ci = n - frozen = [] - cpi = n - for si in range(len(segs)): - c = cov2[ci][1] - # c0 = cov[ci][1] - # lseg = segs[si][-1][0].end - segs[si][0][0].start - freeze = 0 - # if segs[si][0][0].start < 54857402 and segs[si][-1][0].end > 54857402: - # print (segs[si][0][0].start, segs[si][-1][0].end), (segs[si-1][0][0].start, segs[si-1][-1][0].end), (segs[si+1][0][0].start, segs[si+1][-1][0].end) - # print stats.ttest_ind([cc[1] for cc in cov[ci:ci + len(segs[si])]], [cs[1] for cs in cov[ci - len (segs[si - 1]):ci]], equal_var=False) - # print abs(cp - c), 3 * math.sqrt(max(cp, c) / rd_global) * h0 - # print abs(cn - c), 3 * math.sqrt(max(cn, c) / rd_global) * h0 - # print [cs[1] for cs in cov[ci - len (segs[si - 1]):ci]] - # print [cc[1] for cc in cov[ci:ci + len(segs[si])]] - # print [cs[1] for cs in cov[ci + len (segs[si]):ci + len(segs[si]) + len(segs[si + 1])]] - if si > 0: - if len(segs[si]) < 15 or len(segs[si - 1]) < 15: - cp = cov2[ci - 1][1] - if abs(cp - c) > 3 * math.sqrt(max(cp, c) / rd_global) * h0: - # if abs(cp - c) > 2 * hr(c, window_size * len(segs[si])): - freeze |= 1 - if len(segs[si]) > 1 and len(segs[si - 1]) > 1: - if ( - self.meanshift_pval( - [cc[1] for cc in cov[ci : ci + len(segs[si])]], - [cs[1] for cs in cov[ci - len(segs[si - 1]) : ci]], - ) - < pvalue - ): - freeze |= 1 - if si < len(segs) - 1: - if len(segs[si]) < 15 or len(segs[si + 1]) < 15: - cn = cov2[ci + len(segs[si])][1] - if abs(cn - c) > 3 * math.sqrt(max(cn, c) / rd_global) * h0: - # if abs(cn - c) > 2 * hr(c, window_size * len(segs[si])): - freeze |= 2 - if ( - self.meanshift_pval( - [cc[1] for cc in cov[ci : ci + len(segs[si])]], - [cs[1] for cs in cov[ci + len(segs[si]) : ci + len(segs[si]) + len(segs[si + 1])]], - ) - < pvalue - ): - freeze |= 2 - # if freeze > 0: - frozen.append((segs[si], freeze, c, cov2[cpi : ci + len(segs[si])], cov[cpi : ci + len(segs[si])])) - ci += len(segs[si]) - if freeze > 0: - cpi = ci - # for f in frozen: - # print str(hg.interval(f[0][0][0].chrom, f[0][0][0].start, f[0][-1][0].end)), f[1], f[2], str(i) - # print '----...-------------...------------...-------------...-----------------...----------------------' - # (list of windows[(windowinterval,ms)], left%2/right%4freeze, avg_coverage) - - plist = [] - ms1list = [] - ms2list = [] - cms = [] - c1list = [] - c2list = [] - for msi in range(len(frozen)): - cms.append(frozen[msi]) - if frozen[msi][1] % 4 >= 2: - plist.append(frozen[msi][0][-1][0].end) - avgc = np.average(reduce(lambda x, y: x + y, [[c[1] for c in cc[4]] for cc in cms], [])) - ms1list.append(avgc * 2 / self.median_coverage(window_size, gcc)[0]) - 
c1list.append(reduce(lambda x, y: x + y, [[c[1] for c in cc[4]] for cc in cms], [])) - if len(ms1list) > 1: - ms2list.append(avgc * 2 / self.median_coverage(window_size, gcc)[0]) - c2list.append((reduce(lambda x, y: x + y, [[c[1] for c in cc[4]] for cc in cms], []))) - cms = [] - if len(cms) > 0: - avgc = np.average(reduce(lambda x, y: x + y, [[c[1] for c in cc[4]] for cc in cms], [])) - ms2list.append(avgc * 2 / self.median_coverage(window_size, gcc)[0]) - c2list.append((reduce(lambda x, y: x + y, [[c[1] for c in cc[4]] for cc in cms], []))) - - shifts = list(zip(plist, ms1list, ms2list, c1list, c2list)) - - # for a in shifts: - # print a[0], a[1], a[2], len(a[3]), len(a[4]), str(i) - # print '---------------------------------------------------------' - if len(shifts) > 0: - merge = True - else: - merge = False - while merge: - shifts = list(shifts) - merge = False - mergelist = [] - for shiftsi in range(len(shifts)): - s3 = [shifts[shiftsi][3], shifts[shiftsi][3][1:], shifts[shiftsi][3][:-1], shifts[shiftsi][3][1:-1]] - s4 = [shifts[shiftsi][4], shifts[shiftsi][4][1:], shifts[shiftsi][4][:-1], shifts[shiftsi][4][1:-1]] - min_ttest_val = 1.0 - for s3i in s3: - for s4i in s4: - p = self.meanshift_pval(s3i, s4i) - min_ttest_val = min(min_ttest_val, p) - if min_ttest_val > pvalue: - mergelist.append(shiftsi) - if len(mergelist) > 0: - merge = True - plist = [] - ms1list = [] - ms2list = [] - c1list = [] - c2list = [] - c1 = [] - for shiftsi in range(len(shifts)): - c1.extend(shifts[shiftsi][3]) - if shiftsi not in mergelist: - plist.append(shifts[shiftsi][0]) - avgc = np.average(c1) - ms1list.append(avgc * 2 / self.median_coverage(window_size, gcc)[0]) - c1list.append(c1) - if len(plist) > 1: - c2list.append(c1) - ms2list.append(avgc * 2 / self.median_coverage(window_size, gcc)[0]) - c1 = [] - if len(plist) > 0: - c1.extend(shifts[-1][4]) - avgc = np.average(c1) - ms2list.append(avgc * 2 / self.median_coverage(window_size, gcc)[0]) - c2list.append(c1) - shifts = zip(plist, ms1list, ms2list, c1list, c2list) - # for a in shifts: - # print a[0], a[1], a[2], len(a[3]), len(a[4]) - # print '---------------------------------------------------------' - if self.sensitivems: - shifts_select = [s for s in shifts if abs(s[2] - s[1]) >= 1] - else: - shifts_select = [ - s for s in shifts if abs(s[2] - s[1]) >= max(1, min(max(s[2], s[1]) / 10.0, math.sqrt(max(s[2], s[1])))) - ] - if len(shifts_select) == 0: - return hg.interval_list( - [ - hg.interval( - i.chrom, - i.start, - i.end, - info={ - "cn": np.average([c[1] for c in cov[n:-n]]) * 2 / self.median_coverage(window_size, gcc)[0] - }, - ) - ] - ) - else: - shift_intervals = hg.interval_list([]) - start = i.start - for si in shifts_select: - shift_intervals.append(hg.interval(i.chrom, start, si[0], info={"cn": si[1]})) - start = si[0] + 1 - shift_intervals.append(hg.interval(i.chrom, start, i.end, info={"cn": shifts_select[-1][2]})) - return shift_intervals - - def meanshift_refined(self, i, window_size0=10000, window_size1=300, gcc=False, shifts_unrefined=None): - logging.debug("Meanshift refining " + i.chrom + ":" + str(i.start) + "-" + str(i.end)) - if hg.chrLen[hg.chrNum(i.chrom)] < 3 * window_size0: - logging.debug("small chrom") - ms_ws1 = self.meanshift_segmentation(i, window_size1, gcc) - for ii in ms_ws1: - ii.info["start_refined"] = True - ii.info["end_refined"] = True - logging.debug(str((ii.start, ii.end, ii.info["cn"]))) - return ms_ws1 - if shifts_unrefined is None: - shifts0 = self.meanshift_segmentation(i, window_size0, gcc, 
pvalue=0.0027) - else: - shifts0 = shifts_unrefined - - shift1_intervals = hg.interval_list(hg.interval(msi.chrom, msi.end, msi.end) for msi in shifts0[:-1]) - shift1_intervals = [msi[0] for msi in shift1_intervals.merge_clusters(extend=3 * window_size0)] - shifts1 = reduce( - lambda x, y: x + y, - [ - self.meanshift_segmentation( - hg.interval(i.chrom, s.start - 3 * window_size0, s.start + 3 * window_size0), - window_size1, - gcc, - pvalue=0.05, - ) - for s in shift1_intervals - ], - [], - ) - - matched_shifts = [] - prev_end = None - for s0i in range(len(shifts0[:-1])): - cndiff0 = shifts0[s0i + 1].info["cn"] - shifts0[s0i].info["cn"] - bests1i = None - bestscore = 0 - for s1i in range(len(shifts1) - 1): - if shifts1[s1i].end < i.start or shifts1[s1i].end >= i.end: - continue - if abs(shifts0[s0i].end - shifts1[s1i].end) >= window_size0: - continue - cndiff1 = shifts1[s1i + 1].info["cn"] - shifts1[s1i].info["cn"] - if cndiff0 * cndiff1 < 0 or cndiff0 / cndiff1 <= 0.5 and cndiff0 / cndiff1 >= 2: - continue - if bests1i is None: - bests1i = s1i - bestscore = abs(cndiff0 - cndiff1) - elif abs(cndiff0 - cndiff1) < bestscore: - bestscore = abs(cndiff0 - cndiff1) - bests1i = s1i - best_start = prev_end + 1 if prev_end is not None else shifts0[s0i].start - best_end = shifts1[bests1i].end if bests1i is not None else shifts0[s0i].end - matched_shifts.append( - hg.interval( - i.chrom, - best_start, - best_end, - info={ - "cn": shifts0[s0i].info["cn"], - "start_refined": prev_end is not None, - "end_refined": bests1i is not None, - }, - ) - ) - prev_end = shifts1[bests1i].end if bests1i is not None else None - if len(shifts0) > 1: - s0i = -1 - best_start = prev_end + 1 if prev_end is not None else shifts0[s0i].start - best_end = shifts0[s0i].end - matched_shifts.append( - hg.interval( - i.chrom, - best_start, - best_end, - info={"cn": shifts0[s0i].info["cn"], "start_refined": prev_end is not None, "end_refined": False}, - ) - ) - else: - matched_shifts.append( - hg.interval( - i.chrom, - i.start, - i.end, - info={"cn": shifts0[0].info["cn"], "start_refined": False, "end_refined": False}, - ) - ) - - for ii in matched_shifts: - logging.debug(str((ii.start, ii.end, ii.info["cn"]))) - - return matched_shifts - - def get_meanshift(self, i, window_size0=10000, window_size1=300, gcc=False): - logging.debug("get_meanshift on " + str(i)) - file_name = "%s_%s_%s_%s_cnseg.txt" % (self.sample_name, i.chrom, i.start, i.end) - if os.path.exists(file_name) and i.end - i.start > 50000: - logging.debug("Re-using cn-seg info in " + file_name) - msfile = open(file_name) - msr = [] - for line in msfile: - if len(line) == 0 or line[0] == "#": - continue - ll = line.strip().split() - msi = hg.interval( - str(ll[0]), - int(ll[1]), - int(ll[2]), - info={"cn": float(ll[3]), "start_refined": bool(ll[4]), "end_refined": bool(ll[5])}, - ) - msr.append(msi) - else: - msr = self.meanshift_refined(i, window_size0=window_size0, window_size1=window_size1, gcc=gcc) - msfile = open(file_name, "w") - msfile.write("#chrom\tstart\tend\tcn\tstart_refined\tend_refined\n") - for ms in msr: - msfile.write( - "%s\t%s\t%s\t%s\t%s\t%s\n" - % (ms.chrom, ms.start, ms.end, ms.info["cn"], ms.info["start_refined"], ms.info["end_refined"]) - ) - msfile.close() - return msr - - def interval_crossing_arcs(self, chrom, start, end, strand, ilist): - if strand == -1: - return [ - a - for a in self.fetch(chrom, max(0, start), min(end, hg.chrLen[hg.chrNum(chrom)])) - if not a.is_unmapped - and a.is_reverse - and ( - a.mate_is_unmapped - or 
a.next_reference_id == -1 - or len( - ilist.intersection( - [hg.interval(a.next_reference_name, a.next_reference_start, a.next_reference_start)] - ) - ) - == 0 - ) - ] - else: - return [ - a - for a in self.fetch(chrom, max(0, start), min(end, hg.chrLen[hg.chrNum(chrom)])) - if not a.is_unmapped - and not a.is_reverse - and ( - a.mate_is_unmapped - or a.next_reference_id == -1 - or len( - ilist.intersection( - [hg.interval(a.next_reference_name, a.next_reference_start, a.next_reference_start)] - ) - ) - == 0 - ) - ] - - # Methods to find breakpoint edges in amplicon - def get_mates(self, a): - gmt = time() - self.get_mates_num_calls += 1 - try: - miter = self.secondary_index.find(a.query_name) - retval = [m for m in miter if m.is_read1 != a.is_read1] - self.get_mates_time += time() - gmt - return retval - except: - # print time(), 'get_mates', str(a) - retval = [ - a2 - for a2 in self.fetch(a.next_reference_name, a.next_reference_start, a.next_reference_start + 1) - if a2.query_name == a.query_name and a2.is_read1 != a.is_read1 - ] - # retval = [self.bamfile.mate(a)] - # print time(), 'got_mates' - self.get_mates_time += time() - gmt - return retval - - def pair_support_count(self, chrom, position, strand, meanshift, foldup=False, sensitivems=True): - # str(hg.interval(f[0][0][0].chrom, f[0][0][0].start, f[0][-1][0].end)), f[1], f[2] - cd = 1 - for fi in range(len(meanshift)): - f = meanshift[fi] - if len(f) == 0: - continue - if not hg.interval(f[0].chrom, f[0].start, f[-1].end).intersects( - hg.interval(chrom, position, position), extend=self.ms_window_size - ): - continue - for pi in range(len(f)): - if f[pi].start + self.ms_window_size >= position: - break - # pi = bisect.bisect_left(f[1], (position,)) - if ( - pi > 0 - and pi < len(f) - and f[pi].start - self.ms_window_size <= position - and (f[pi].info["cn"] - f[pi - 1].info["cn"]) / strand > 0 - ): - cd = abs(f[pi].info["cn"] - f[pi - 1].info["cn"]) - elif pi > 0: - cd = f[pi - 1].info["cn"] - else: - cd = f[0].info["cn"] - mc = self.median_coverage() - cd = max(1, cd) - if self.sensitivems and sensitivems: - cd = min(cd, 10) - pcount = max(mc[4] * cd / 20.0 * ((self.insert_size - self.read_length) / 2 / self.read_length) * mc[12], 2) - pmincount = mc[11] - if pcount < mc[11]: - pcount = pmincount - return pcount - - def concordant_edge(self, v, bp_margin=0): - if v.pos == 0: - return (None, []) - elif v.strand == 1: - dlist = [ - a - for a in self.fetch(v.chrom, max(1, v.pos - self.max_insert), v.pos) - if not a.is_unmapped - and not a.is_reverse - and a.is_proper_pair - and a.next_reference_name == v.chrom - and a.next_reference_start >= v.pos - and a.reference_start < v.pos - bp_margin - and a.next_reference_start < a.reference_start + self.max_insert - self.read_length - ] - if len(dlist) > self.pair_support: - v2 = breakpoint_vertex(v.chrom, max(v.pos + 1, min([a.next_reference_start for a in dlist])), -1) - logging.debug( - "#TIME " + "%.3f\t" % (time() - TSTART) + " concordant edges " + str(v) + " " + str(len(dlist)) - ) - return (breakpoint_edge(v, v2), dlist) - else: - dlist = [ - a - for a in self.fetch(v.chrom, max(1, v.pos - self.max_insert), v.pos) - if not a.is_reverse - and a.is_proper_pair - and not a.is_unmapped - and a.next_reference_name == v.chrom - and a.next_reference_start >= v.pos - and a.reference_start < v.pos - bp_margin - and a.next_reference_start < a.reference_start + self.max_insert - self.read_length - ] - if len(dlist) > self.pair_support: - v2 = breakpoint_vertex(v.chrom, min(v.pos - 1, 
max([a.reference_end - 1 for a in dlist])), 1) - logging.debug( - "#TIME " + "%.3f\t" % (time() - TSTART) + " concordant edges " + str(v) + " " + str(len(dlist)) - ) - return (breakpoint_edge(v, v2), dlist) - logging.debug("#TIME " + "%.3f\t" % (time() - TSTART) + " concordant edges " + str(v) + " not found") - return (None, dlist) - - def foldup_count(self, chrom, position, strand, cdiff=-1): - interval = hg.interval( - chrom, - max(1, position - self.ms_window_size), - min(hg.chrLen[hg.chrNum(chrom)], position + self.ms_window_size), - ) - if strand == 1: - dlist = [ - a - for a in self.fetch(interval.chrom, interval.start, interval.end) - if not a.is_unmapped - and not a.is_reverse - and a.is_paired - and not a.is_proper_pair - and not a.mate_is_unmapped - and not a.mate_is_reverse - and a.reference_name == a.next_reference_name - and abs(a.next_reference_start - a.reference_start) < 100000 - ] # self.ms_window_size] - else: - dlist = [ - a - for a in self.fetch(interval.chrom, interval.start, interval.end) - if not a.is_unmapped - and a.is_reverse - and a.is_paired - and not a.is_proper_pair - and not a.mate_is_unmapped - and a.mate_is_reverse - and a.reference_name == a.next_reference_name - and abs(a.next_reference_start - a.reference_start) < 100000 - ] # self.ms_window_size] - return len(dlist) - - def refine_discordant_edge(self, e): - # logging.debug("#TIME " + '%.3f\t'%(time() - TSTART) + " refine discordant edge " + str(e)) - v1min = max(0, (e.v1.pos - self.max_insert + self.read_length if e.v1.strand == 1 else e.v1.pos) - 1) - v2min = max(0, (e.v2.pos - self.max_insert + self.read_length if e.v2.strand == 1 else e.v2.pos) - 1) - v1max = ( - min( - e.v1.pos + self.max_insert - self.read_length if e.v1.strand == -1 else e.v1.pos, - hg.chrLen[hg.chrNum(e.v1.chrom)], - ) - - 1 - ) - v2max = ( - min( - e.v2.pos + self.max_insert - self.read_length if e.v2.strand == -1 else e.v2.pos, - hg.chrLen[hg.chrNum(e.v2.chrom)], - ) - - 1 - ) - d1list = [a for a in self.fetch(e.v1.chrom, v1min, v1max) if not a.is_unmapped] - d2list = [a for a in self.fetch(e.v2.chrom, v2min, v2max) if not a.is_unmapped] - d1Set = set([(a.query_name, a.is_read1, a.is_reverse, a.is_secondary) for a in d1list]) - if e.v1.strand == e.v2.strand: - d2Set = set([(a.query_name, a.is_read1, not a.is_reverse, not a.is_secondary) for a in d2list]) - else: - d2Set = set([(a.query_name, a.is_read1, a.is_reverse, not a.is_secondary) for a in d2list]) - rSet = d1Set.intersection(d2Set) - if len(rSet) == 0: - return (e, 0, [], None) - multi_r = set() - d1reads = {} - d2reads = {} - for a in d1list: - if (a.query_name, a.is_read1, a.is_reverse, a.is_secondary) in d1reads: - multi_r.add((a.query_name, a.is_read1, a.is_reverse, a.is_secondary)) - d1reads[(a.query_name, a.is_read1, a.is_reverse, a.is_secondary)] = a - if e.v1.strand == e.v2.strand: - for a in d2list: - if (a.query_name, a.is_read1, not a.is_reverse, not a.is_secondary) in d2reads: - multi_r.add((a.query_name, a.is_read1, not a.is_reverse, not a.is_secondary)) - d2reads[(a.query_name, a.is_read1, not a.is_reverse, not a.is_secondary)] = a - else: - for a in d2list: - if (a.query_name, a.is_read1, a.is_reverse, not a.is_secondary) in d2reads: - multi_r.add((a.query_name, a.is_read1, a.is_reverse, not a.is_secondary)) - d2reads[(a.query_name, a.is_read1, a.is_reverse, not a.is_secondary)] = a - - dpairs = defaultdict(lambda: [], {}) - for aa in rSet: - if a.query_name in multi_r: - continue - a1 = d1reads[aa] - a2 = d2reads[aa] - a1clip_prefix = 0 - for a1c in 
a1.cigartuples: - if a1c[0] == 5: - a1clip_prefix += a1c[1] - else: - break - a2clip_prefix = 0 - for a2c in a2.cigartuples: - if a2c[0] == 5: - a2clip_prefix += a2c[1] - else: - break - a1clip_suffix = 0 - for a1c in a1.cigartuples[::-1]: - if a1c[0] == 5: - a1clip_suffix += a1c[1] - else: - break - a2clip_suffix = 0 - for a2c in a2.cigartuples[::-1]: - if a2c[0] == 5: - a2clip_suffix += a2c[1] - else: - break - - if a1.is_reverse: - r1 = ( - a1.infer_query_length() + a1clip_suffix - a1.query_alignment_end, - a1.infer_query_length() + a1clip_suffix - a1.query_alignment_start - 1, - ) - else: - r1 = (a1.query_alignment_start + a1clip_prefix, a1.query_alignment_end - 1 + a1clip_prefix) - if a2.is_reverse: - r2 = ( - a2.infer_query_length() + a2clip_suffix - a2.query_alignment_end, - a2.infer_query_length() + a2clip_suffix - a2.query_alignment_start - 1, - ) - else: - r2 = (a2.query_alignment_start + a2clip_prefix, a2.query_alignment_end - 1 + a2clip_prefix) - - if r1[0] <= r2[0] and r1[1] <= r2[1]: - hom = r1[1] - r2[0] + 1 - prefix = True - elif r1[0] >= r2[0] and r1[1] >= r2[1]: - hom = r2[1] - r1[0] + 1 - prefix = False - else: - continue - - if ((e.v1.strand == 1) == (not a1.is_reverse)) != prefix: - continue - - if hom > 0: - # p1 = a1.reference_end - hom - 1 if e.v1.strand == 1 else a1.reference_start + hom - # p2 = a2.reference_end - hom - 1 if e.v2.strand == 1 else a2.reference_start + hom - p1 = a1.reference_end - 1 if e.v1.strand == 1 else a1.reference_start - p2 = a2.reference_end - 1 if e.v2.strand == 1 else a2.reference_start - else: - p1 = a1.reference_end - 1 if e.v1.strand == 1 else a1.reference_start - p2 = a2.reference_end - 1 if e.v2.strand == 1 else a2.reference_start - if (e.v1.chrom, e.v1.pos, e.v1.strand) != (e.v2.chrom, e.v2.pos, e.v2.strand): - dpairs[(hom, p1, p2)].append((a1, a2, r1, r2)) - elif p1 >= p2: - dpairs[(hom, p1, p2)].append((a1, a2, r1, r2)) - - if len(dpairs) == 0: - return (e, 0, [], None) - max_s = max([len(s) for s in dpairs.values()]) - max_p = [p for p in dpairs.keys() if len(dpairs[p]) == max_s] - - if len(max_p) != 1: - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " refine discordant edge max_p not 1 " - + str(e) - + " " - + str(max_p) - ) - return (e, 0, [], None) - hom = max_p[0][0] - hom_seq = "" - if dpairs[max_p[0]][0][0].is_secondary: - vstrand = e.v2.strand - a = dpairs[max_p[0]][0][1] - else: - vstrand = e.v1.strand - a = dpairs[max_p[0]][0][0] - if hom >= 0: - if vstrand == 1: - hom_seq = a.query_sequence[a.query_alignment_end - hom : a.query_alignment_end] - else: - hom_seq = a.query_sequence[a.query_alignment_start : a.query_alignment_start + hom] - else: - if vstrand == 1: - hom_seq = a.query_sequence[a.query_alignment_end : a.query_alignment_end + abs(hom)] - else: - hom_seq = a.query_sequence[a.query_alignment_start - abs(hom) : a.query_alignment_start] - p1 = max_p[0][1] - p2 = max_p[0][2] - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " refine discordant edge found " - + str( - breakpoint_edge( - breakpoint_vertex(e.v1.chrom, p1, e.v1.strand), breakpoint_vertex(e.v2.chrom, p2, e.v2.strand) - ) - ) - + " " - + str(hom) - + " " - + str(len(dpairs[max_p[0]])) - + " " - + str(len(rSet)) - ) - return ( - breakpoint_edge( - breakpoint_vertex(e.v1.chrom, p1, e.v1.strand), - breakpoint_vertex(e.v2.chrom, p2, e.v2.strand), - hom=hom, - hom_seq=hom_seq, - ), - hom, - dpairs[max_p[0]], - hom_seq, - ) - - def edge_has_high_mapq(self, read_list): - bp1_mapq = max([rr[0].mapping_quality for rr in 
read_list]) - bp2_mapq = max([rr[1].mapping_quality for rr in read_list]) - logging.debug("#TIME " + "%.3f\t" % (time() - self.tstart) + " breakpoint_mapq: %d %d" % (bp1_mapq, bp2_mapq)) - if bp1_mapq < self.breakpoint_mapping_quality_cutoff: - return False - if bp2_mapq < self.breakpoint_mapping_quality_cutoff: - return False - return True - - def edge_has_high_entropy(self, read_list): - try: - bp1_entropy = max( - [ - stats.entropy( - np.unique([x for x in rr[0].get_reference_sequence().upper() if x != "N"], return_counts=True)[ - 1 - ] - ) - for rr in read_list - ] - ) - bp2_entropy = max( - [ - stats.entropy( - np.unique([x for x in rr[1].get_reference_sequence().upper() if x != "N"], return_counts=True)[ - 1 - ] - ) - for rr in read_list - ] - ) - except ValueError: - # if the MD tag is missing from the BAM file (e.g. Isaac was used as the aligner, or some CRAM files), instead use the query sequence for entropy calc. - bp1_entropy = max( - [ - stats.entropy( - np.unique([x for x in rr[0].query_alignment_sequence.upper() if x != "N"], return_counts=True)[ - 1 - ] - ) - for rr in read_list - ] - ) - bp2_entropy = max( - [ - stats.entropy( - np.unique([x for x in rr[1].query_alignment_sequence.upper() if x != "N"], return_counts=True)[ - 1 - ] - ) - for rr in read_list - ] - ) - - logging.debug( - "#TIME " + "%.3f\t" % (time() - TSTART) + " breakpoint_entropy: %.3f %.3f" % (bp1_entropy, bp2_entropy) - ) - - if bp1_entropy < self.breakpoint_entropy_cutoff: - return False - if bp2_entropy < self.breakpoint_entropy_cutoff: - return False - return True - - def edge_passes_filters(self, read_list, e=None): - logging.debug("#TIME " + "%.3f\t" % (time() - TSTART) + " edge_breakpoint_filter: " + str(e)) - if self.edge_has_high_mapq(read_list) and self.edge_has_high_entropy(read_list): - return True - return False - - def sa_tag_overlaps_primary(self, a): - if not a.has_tag("SA"): - return False - t = a.get_tag("SA").split(",") - if t[0] != a.reference_name: - return False - if (t[2] == "+") != a.is_reverse: - return False - if min(abs(int(t[1]) - a.reference_start), abs(int(t[1]) - a.reference_end)) > self.read_length: - return False - return True - - def sa_tag_mismatch_breakpoint(self, a, bp): - if not a.has_tag("SA"): - return False - t = a.get_tag("SA").split(",") - if t[0] != a.reference_name: - return True - if (t[2] == "+") != a.is_reverse: - return True - if bp.strand == -1 and (a.reference_start != bp.pos or int(t[1]) != bp.pos): - return True - if bp.strand == 1: - if abs(a.reference_end - bp.pos) > 10: - return True - cigar_counts = [int(i) for i in re.findall(r"\d+", t[3])] - cigar_op = [i for i in re.findall(r"\D", t[3])] - sa_ref_len = sum([i[0] for i in zip(cigar_counts, cigar_op) if i[1] in "MDNX"]) - if abs(int(t[1]) + sa_ref_len - bp.pos) > 10: - return True - return False - - def interval_discordant_edges(self, interval, filter_repeats=True, pair_support=-1, ms=None, amplicon_name=None): - logging.debug("#TIME " + "%.3f\t" % (time() - TSTART) + " searching discordant edges in " + str(interval)) - if pair_support == -1: - pair_support = self.pair_support - if type(interval) != hg.interval_list: - ilist = hg.interval_list([interval]) - else: - ilist = interval - if ( - tuple([(i.chrom, i.start, i.end) for i in ilist]), - filter_repeats, - pair_support, - not ms is None, - ) in self.discordant_edge_calls: - return self.discordant_edge_calls[ - (tuple([(i.chrom, i.start, i.end) for i in ilist]), filter_repeats, pair_support, not ms is None) - ] - - interval = ilist[0] - dflist = 
[] - drlist = [] - for i in ilist: - dflist += [ - a - for a in self.fetch(i.chrom, max(1, i.start), i.end) - if not a.is_unmapped - and not a.is_reverse - and a.is_paired - and not a.is_proper_pair - and not a.mate_is_unmapped - and not a.is_secondary - and a.reference_end is not None - and a.mapping_quality > self.mapping_quality_cutoff - and not ( - a.reference_name == a.next_reference_name - and a.mate_is_reverse - and abs(a.reference_start - a.next_reference_start) < self.max_insert - ) - ] # this section catches everted sequencing artifacts - drlist += [ - a - for a in self.fetch(i.chrom, max(1, i.start), i.end) - if not a.is_unmapped - and a.is_reverse - and a.is_paired - and not a.is_proper_pair - and not a.mate_is_unmapped - and not a.is_secondary - and a.reference_end is not None - and a.mapping_quality > self.mapping_quality_cutoff - and not ( - a.reference_name == a.next_reference_name - and not a.mate_is_reverse - and abs(a.reference_start - a.next_reference_start) < self.max_insert - ) - ] # this section catches everted sequencing artifacts - # logging.debug("#TIME " + '%.3f\t'%(time() - TSTART) + " discordant edges: fetch discordant " + str(interval) + " " + str(len(dflist)) + " " + str(len(drlist))) - # dflist = [a for a in dflist if not(a.reference_name == a.next_reference_name and a.mate_is_reverse and abs(a.template_length) < self.max_insert)] - # drlist = [a for a in drlist if not(a.reference_name == a.next_reference_name and not a.mate_is_reverse and abs(a.template_length) < self.max_insert)] - - # dflist = [a for a in dflist if not(a.reference_name == a.next_reference_name and a.mate_is_reverse and abs(a.reference_start - a.next_reference_start) < self.max_insert)] - # drlist = [a for a in drlist if not(a.reference_name == a.next_reference_name and not a.mate_is_reverse and abs(a.reference_start - a.next_reference_start) < self.max_insert)] - - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " discordant edges: discordant read pairs found: %s %s %s" % (str(interval), len(dflist), len(drlist)) - ) - - # perform biclustering for readpairs using union-find algorithm to give sets of connected read-pairs clist - vlist = [] - vcount = 0 - vdict = {} - for a in dflist + drlist: - vlist.append( - ( - hg.absPos(a.reference_name, a.reference_start) * (-1 if a.is_reverse else 1), - hg.absPos(a.next_reference_name, a.next_reference_start) * (-1 if a.mate_is_reverse else 1), - a, - vcount, - ) - ) - vdict[vcount] = a - vcount += 1 - # vlist = [(hg.absPos(a.reference_name, a.reference_start) * (-1 if a.is_reverse else 1), hg.absPos(a.next_reference_name, a.next_reference_start) * (-1 if a.mate_is_reverse else 1), a) for a in dflist + drlist] - v0list = copy.copy(vlist) - v0list.sort(key=lambda x: x[0]) - v1list = copy.copy(vlist) - v1list.sort(key=lambda x: x[1]) - dlist = [] - v0listp = [v[0] for v in v0list] - v1listp = [v[1] for v in v1list] - plist = defaultdict(lambda: None, {}) - rlist = defaultdict(lambda: 0, {}) - nlist = defaultdict(lambda: 1, {}) - # identify edges with bisect and union-find algorithm - # iii = 0 - for v in vlist: - # iii += 1 - s0 = bisect.bisect_left(v0listp, v[0] - self.max_insert + self.read_length) - e0 = bisect.bisect_right(v0listp, v[0] + self.max_insert - self.read_length) - s1 = bisect.bisect_left(v1listp, v[1] - self.max_insert + self.read_length) - e1 = bisect.bisect_right(v1listp, v[1] + self.max_insert - self.read_length) - SS0 = [vv[3] for vv in v0list[s0 : e0 + 1] if vv[3] > v[3]] - SS1 = [vv[3] for vv in v1list[s1 : e1 + 1] 
if vv[3] > v[3]] - SS0.sort() - SS1.sort() - SS_intersect = [] - i0 = 0 - i1 = 0 - while True: - if i0 == len(SS0) or i1 == len(SS1): - break - if SS0[i0] == SS1[i1]: - SS_intersect.append(SS0[i0]) - i0 += 1 - i1 += 1 - elif SS0[i0] < SS1[i1]: - i0 += 1 - else: - i1 += 1 - if len(SS_intersect) >= pair_support: - dlist.append(v[2]) - v1 = v[3] - for v2 in SS_intersect: - v1g = v1 - v2g = v2 - while plist[v1g] is not None: - v1g = plist[v1g] - while plist[v2g] is not None: - v2g = plist[v2g] - if v1g == v2g: - continue - if rlist[v1g] > rlist[v2g]: - plist[v2g] = v1g - rlist[v1g] = max(rlist[v1g], rlist[v2g] + 1) - nlist[v1g] += nlist[v2g] - else: - plist[v1g] = v2g - rlist[v2g] = max(rlist[v2g], rlist[v1g] + 1) - nlist[v2g] += nlist[v1g] - clist = defaultdict(lambda: [], {}) - for v in plist: - vg = v - while plist[vg] is not None: - vg = plist[vg] - clist[vdict[vg]].append(vdict[v]) - - mcdflist = [] - mcdrlist = [] - hgddict = {} - for c in clist: - if len(clist[c]) < pair_support: - continue - ml = clist[c] - if filter_repeats: - ml = [ - v - for v in clist[c] - if not hg.interval(v, bamfile=self.bamfile).filter_repeat() - and v.mapping_quality > self.mapping_quality_cutoff - ] - if len(ml) < pair_support: - continue - hgl = hg.interval_list([]) - for v in ml: - hgv = hg.interval(v, bamfile=self.bamfile) - hgddict[hgv] = v - hgl.append(hgv) - hgl.sort() - if c.is_reverse: - mcdrlist.extend(hgl.merge_clusters(extend=self.max_insert - self.read_length)) - else: - mcdflist.extend(hgl.merge_clusters(extend=self.max_insert - self.read_length)) - - # logging.debug("#TIME " + '%.3f\t'%(time() - TSTART) + " discordant edges: discordant clusters found: %s %d %d " % (str(interval), len(mcdflist), len(mcdrlist))) - - dnlist0 = [] - dnlist = [] - clist = hg.interval_list([c[0] for c in mcdflist + mcdrlist]) - clist.sort() - ci = 0 - for c1 in mcdflist + mcdrlist: - ci += 1 - neighbor_hglist = hg.interval_list([]) - for a1 in c1[1]: - neighbor_hglist.append( - hg.interval( - hgddict[a1].next_reference_name, - hgddict[a1].next_reference_start, - hgddict[a1].next_reference_start, - ) - ) - neighbor_hglist.sort() - neighbor_hglist = hg.interval_list( - [ - a2[0] - for a2 in neighbor_hglist.merge_clusters(extend=self.max_insert - self.read_length) - if len(a2[1]) >= pair_support - ] - ) - for c2 in mcdflist + mcdrlist: - if len(hg.interval_list([c2[0]]).intersection(neighbor_hglist, extend=self.max_insert)) == 0: - continue - vl = [] - vlSet = set() - vl1Set = set() - vl2Set = set() - for a1 in c1[1]: - for a2 in c2[1]: - aq1 = hgddict[a1] - aq2 = hgddict[a2] - if aq1.query_name == aq2.query_name and aq1.is_read1 != aq2.is_read1: - if ( - aq1.reference_name == aq2.reference_name - and abs(aq1.reference_start - aq2.reference_start) < self.read_length - and abs(aq1.reference_end - aq2.reference_end) < self.read_length - and aq1.is_reverse != aq2.is_reverse - ): - continue - if ( - aq1.reference_name == aq2.reference_name - and aq1.is_reverse - and not aq2.is_reverse - and aq1.reference_start - aq2.reference_end + 1 > 0 - and aq1.reference_start - aq2.reference_end + 1 < self.max_insert - 2 * self.read_length - ): - continue - if ( - aq2.reference_name == aq1.reference_name - and aq2.is_reverse - and not aq1.is_reverse - and aq2.reference_start - aq1.reference_end + 1 > 0 - and aq2.reference_start - aq1.reference_end + 1 < self.max_insert - 2 * self.read_length - ): - continue - vl.append((aq1, aq2)) - vlSet.add((aq1.reference_start, aq1.reference_end, aq2.reference_start, aq2.reference_end)) - 
vl1Set.add((aq1.reference_start, aq1.reference_end)) - vl2Set.add((aq2.reference_start, aq2.reference_end)) - if len(vl) == 0 or len([v for v in vl if v[1].reference_start * v[0].reference_start > 0]) == 0: - continue - if not vl[0][0].is_reverse: - bp1 = breakpoint_vertex( - c1[0].chrom, max([v[0].reference_end - 1 for v in vl if v[0].reference_start > 0]), 1 - ) - else: - bp1 = breakpoint_vertex( - c1[0].chrom, min([v[0].reference_start for v in vl if v[0].reference_start > 0]), -1 - ) - if not vl[0][1].is_reverse: - bp2 = breakpoint_vertex( - c2[0].chrom, max([v[1].reference_end - 1 for v in vl if v[1].reference_start > 0]), 1 - ) - else: - bp2 = breakpoint_vertex( - c2[0].chrom, min([v[1].reference_start for v in vl if v[1].reference_start > 0]), -1 - ) - if ms is None: - ps = pair_support - else: - ps = self.pair_support_count(bp1.chrom, bp1.pos, bp1.strand, ms) - if len(vl) < ps or len(vl1Set) < pair_support or len(vl2Set) < pair_support: - continue - - if ( - bp1.chrom == bp2.chrom - and bp1.pos == bp2.pos - and bp1.strand == bp2.strand - and len(vl) < 2 * self.pair_support - ): - continue - - num_inverted = 0 - bp1c = None - bp2c = None - vl2 = [] - if bp1.chrom == bp2.chrom and bp1.strand == bp2.strand and abs(bp1.pos - bp2.pos) <= self.read_length: - non_inverted_reads = set() - multiple_non_inverted = False - if bp1.strand == 1: - for v in vl: - if v[0].reference_start == v[1].reference_start: - num_inverted += 1 - elif self.sa_tag_overlaps_primary(v[0]): - num_inverted += 1 - elif self.sa_tag_overlaps_primary(v[1]): - num_inverted += 1 - else: - vl2.append(v) - if not multiple_non_inverted: - non_inverted_reads.add(v[0].query_name) - if len(non_inverted_reads) >= ps: - multiple_non_inverted = True - else: - for v in vl: - if v[0].reference_end == v[1].reference_end: - num_inverted += 1 - elif self.sa_tag_overlaps_primary(v[0]): - num_inverted += 1 - elif self.sa_tag_overlaps_primary(v[1]): - num_inverted += 1 - else: - vl2.append(v) - if not multiple_non_inverted: - non_inverted_reads.add(v[0].query_name) - if len(non_inverted_reads) >= ps: - multiple_non_inverted = True - logging.debug( - "checking foldback2: " - + str(bp1) - + str(bp2) - + " %s %s %d %d %d" % (bp1.strand, bp2.strand, len(vl), num_inverted, ps) - ) - - if len(vl2) < ps or (not multiple_non_inverted): - logging.debug("FOLDBACK: " + str(bp1) + str(bp2)) - continue - vl = vl2 - vl.sort(key=lambda x: x[0].reference_start - x[1].reference_start) - if bp1.strand == 1: - maxp = vl[0][0].reference_end - 1 - maxn = 0 - for v in vl[::-1]: - if ( - len( - [ - v1 - for v1 in vl - if v1[0].reference_end <= v[0].reference_end - and v1[0].reference_start - > v[0].reference_end - 1 - self.max_insert + 2 * self.read_length - ] - ) - > maxn - ): - maxp = v[0].reference_end - maxn = len( - [ - v1 - for v1 in vl - if v1[0].reference_end <= v[0].reference_end - and v1[0].reference_end - > v[0].reference_end - self.max_insert + 2 * self.read_length - ] - ) - vl = [ - v - for v in vl - if v[0].reference_end - 1 <= maxp - and v[0].reference_end - 1 > maxp - self.max_insert + 2 * self.read_length - ] - if len(vl) < ps: - continue - bp1 = breakpoint_vertex( - c1[0].chrom, max([v[0].reference_end - 1 for v in vl if v[0].reference_start > 0]), 1 - ) - bp2 = breakpoint_vertex( - c2[0].chrom, max([v[1].reference_end - 1 for v in vl if v[1].reference_start > 0]), 1 - ) - if bp1.pos != bp2.pos: - bp1c = bp2 - bp2c = bp1 - else: - maxp = vl[-1][0].pos - maxn = 0 - for v in vl: - if ( - len( - [ - v1 - for v1 in vl - if v1[0].reference_start 
>= v[0].reference_start - and v1[0].reference_start - < v[0].reference_start + self.max_insert - 2 * self.read_length - ] - ) - > maxn - ): - maxp = v[0].reference_start - maxn = len( - [ - v1 - for v1 in vl - if v1[0].reference_start >= v[0].reference_start - and v1[0].reference_start - < v[0].reference_start + self.max_insert - 2 * self.read_length - ] - ) - vl = [ - v - for v in vl - if v[0].reference_start >= maxp - and v[0].reference_start < maxp + self.max_insert - 2 * self.read_length - ] - if len(vl) < ps: - continue - bp1 = breakpoint_vertex( - c1[0].chrom, min([v[0].reference_start for v in vl if v[0].reference_start > 0]), -1 - ) - bp2 = breakpoint_vertex( - c2[0].chrom, min([v[1].reference_start for v in vl if v[1].reference_start > 0]), -1 - ) - if bp1.pos != bp2.pos: - bp1c = bp2 - bp2c = bp1 - bre_refine = self.refine_discordant_edge(breakpoint_edge(bp1, bp2)) - bre = bre_refine[0] - - if bp1.chrom == bp2.chrom and bp1.strand == bp2.strand and abs(bp1.pos - bp2.pos) <= self.read_length: - qname_exclude = set([]) - for v in vl: - if (bp1.strand == 1 and max(v[0].reference_start, v[1].reference_start) > bre.v1.pos) or ( - bp1.strand == -1 and max(v[0].reference_end, v[1].reference_end) < bre.v1.pos - ): - qname_exclude.add(v[0].query_name) - continue - if ( - self.sa_tag_mismatch_breakpoint(v[0], bre.v1) - or self.sa_tag_mismatch_breakpoint(v[0], bre.v1) - or self.sa_tag_overlaps_primary(v[0]) - or self.sa_tag_overlaps_primary(v[1]) - ): - qname_exclude.add(v[0].query_name) - continue - if ( - bp1.strand == 1 - and bre.v1.pos - v[0].reference_start + bre.v2.pos - v[1].reference_start > self.max_insert - ): - qname_exclude.add(v[0].query_name) - continue - if ( - bp2.strand == 1 - and v[0].reference_end - bre.v1.pos + v[1].reference_end - bre.v2.pos > self.max_insert - ): - qname_exclude.add(v[0].query_name) - continue - vl = [v for v in vl if v[0].query_name not in qname_exclude] - if len(vl) < ps: - continue - - if bre.type() == "everted" and abs(bre.v1.pos - bre.v2.pos) < self.max_insert: - logging.debug("skipping everted edge " + str(bp1) + str(bp2)) - continue - if bre.type() != "concordant": - if self.edge_passes_filters(vl, bre): - dnlist0.append((bre, len(vl))) - if bp1c is not None and bp2c is not None: - brec_refine = self.refine_discordant_edge(breakpoint_edge(bp1c, bp2c)) - brec = brec_refine[0] - if brec.type() != "concordant" and brec.v1.pos != brec.v2.pos: - if self.edge_passes_filters(vl, brec): - dnlist0.append((brec, len([(v[1], v[0]) for v in vl]))) - - # remove local edges with no complementary edges and add warning if any found - for bb1 in dnlist0: - for bb2 in dnlist0: - bre1 = bb1[0] - bre2 = bb2[0] - if bre1 == bre2 and (bre1.v1.chrom, bre1.v1.pos, bre1.v1.strand) != ( - bre1.v2.chrom, - bre1.v2.pos, - bre1.v2.strand, - ): - continue - if ( - (bre2.v2.chrom, bre2.v2.pos, bre2.v2.strand) == (bre1.v1.chrom, bre1.v1.pos, bre1.v1.strand) - and (bre2.v1.chrom, bre2.v1.pos, bre2.v1.strand) == (bre1.v2.chrom, bre1.v2.pos, bre1.v2.strand) - ) and bb1 not in dnlist: - dnlist.append(bb1) - continue - if len(dnlist) != len(dnlist0): - logging.debug("dnlists do not match " + str(len(dnlist0)) + " " + str(len(dnlist))) - for bb1 in dnlist0: - if bb1 not in dnlist: - logging.debug("dnlist0: " + str(bb1[0]) + " " + str(bb1[1])) - for bb1 in dnlist: - if bb1 not in dnlist0: - logging.debug("dnlist: " + str(bb1[0]) + " " + str(bb1[1])) - - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " discordant edges: local edges done " - + str(interval) - + " " - + 
str(len(mcdflist)) - + " " - + str(len(mcdrlist)) - + " " - + str(len(dnlist)) - ) - self.get_mates_time = 0 - self.get_mates_num_calls = 0 - for c in mcdflist + mcdrlist: - nlist = [] - if filter_repeats: - if len(hg.interval_list([c[0]]).intersection(hg.conserved_regions)) > 0: - continue - rep_content_time = 0 - intersection_time = 0 - nr_calls = 0 - for hga in c[1]: - nmatelist = self.get_mates(hgddict[hga]) - if filter_repeats: - rpc = time() - nmatelist = [ - a - for a in nmatelist - if not hg.interval(a, bamfile=self.bamfile).filter_repeat() - and a.mapping_quality > self.mapping_quality_cutoff - ] - nr_calls += len(nmatelist) - rep_content_time += time() - rpc - ict = time() - nmatelist = [ - a - for a in nmatelist - if len(hg.interval_list([hg.interval(a, bamfile=self.bamfile)]).intersection(ilist)) == 0 - ] - intersection_time += time() - ict - nlist += nmatelist - nflist = [n for n in nlist if not n.is_reverse] - nrlist = [n for n in nlist if n.is_reverse] - hgndict = {hg.interval(a, bamfile=self.bamfile): a for a in nflist + nrlist} - hgnflist = hg.interval_list([hga for hga in hgndict if hga.strand == 1]) - hgnrlist = hg.interval_list([hga for hga in hgndict if hga.strand == -1]) - hgnflist.sort() - hgnrlist.sort() - mcnflist = hgnflist.merge_clusters(self.max_insert - 2 * self.read_length) - mcnrlist = hgnrlist.merge_clusters(self.max_insert - 2 * self.read_length) - mcnflist = [m for m in mcnflist if len(m[1]) >= pair_support] - mcnrlist = [m for m in mcnrlist if len(m[1]) >= pair_support] - mcnlist = mcnflist + mcnrlist - for cn in mcnlist: - vl = [] - vlSet = set() - vl1Set = set() - vl2Set = set() - if filter_repeats: - if len(hg.interval_list([cn[0]]).intersection(hg.conserved_regions)) > 0: - continue - hgmi = 0 - for hgm in cn[1]: - hgmi += 1 - if filter_repeats: - if hgm.filter_repeat() or hgndict[hgm].mapping_quality <= self.mapping_quality_cutoff: - continue - for a in self.get_mates(hgndict[hgm]): - if filter_repeats: - if ( - hg.interval(a, bamfile=self.bamfile).filter_repeat() - or a.mapping_quality <= self.mapping_quality_cutoff - ): - continue - if hg.interval(a, bamfile=self.bamfile).intersects(c[0]): - vl.append((a, hgndict[hgm])) - vlSet.add( - ( - a.reference_start, - a.reference_end, - hgndict[hgm].reference_start, - hgndict[hgm].reference_end, - ) - ) - vl1Set.add((a.reference_start, a.reference_end)) - vl2Set.add((hgndict[hgm].reference_start, hgndict[hgm].reference_end)) - break - if len(vl) == 0 or len([v for v in vl if v[1].reference_start * v[0].reference_start > 0]) == 0: - continue - if not vl[0][0].is_reverse: - bp1 = breakpoint_vertex(vl[0][0].reference_name, max([v[0].reference_end - 1 for v in vl]), 1) - else: - bp1 = breakpoint_vertex( - vl[0][0].reference_name, min([v[0].reference_start for v in vl if v[0].reference_start > 0]), -1 - ) - if not vl[0][1].is_reverse: - bp2 = breakpoint_vertex(vl[0][1].reference_name, max([v[1].reference_end - 1 for v in vl]), 1) - else: - bp2 = breakpoint_vertex( - vl[0][1].reference_name, min([v[1].reference_start for v in vl if v[1].reference_start > 0]), -1 - ) - if ms is None: - ps = pair_support - else: - ps = self.pair_support_count(bp1.chrom, bp1.pos, bp1.strand, ms) - - if len(vl) < ps or len(vl1Set) < pair_support or len(vl2Set) < pair_support: - continue - num_inverted = 0 - non_inverted_reads = set() - multiple_non_inverted = False - if bp1.chrom == bp2.chrom and bp1.pos == bp2.pos and bp1.strand == bp2.strand: - if bp1.strand == 1: - for v in vl: - if v[0].reference_start == v[1].reference_start: - 
num_inverted += 1 - elif not multiple_non_inverted: - non_inverted_reads.add(v[0].query_name) - if len(non_inverted_reads) >= ps: - multiple_non_inverted = True - else: - for v in vl: - if v[0].reference_end == v[1].reference_end: - num_inverted += 1 - elif not multiple_non_inverted: - non_inverted_reads.add(v[0].query_name) - if len(non_inverted_reads) >= ps: - multiple_non_inverted = True - if len(vl) - num_inverted < ps or (not multiple_non_inverted): - continue - bre_refine = self.refine_discordant_edge(breakpoint_edge(bp1, bp2)) - bre = bre_refine[0] - if bre.type() != "concordant": - if self.edge_passes_filters(vl, bre): - dnlist.append((bre, len(vl))) - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " discordant edges: external edges done " - + str(interval) - + " " - + str(self.get_mates_time) - + " " - + str(self.get_mates_num_calls) - ) - dnlist.sort(key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.5 * x[0].v1.strand) - for e in dnlist: - logging.debug( - "#TIME %.3f\tdiscordant edges %s %s %s %s %d %f" - % ( - time() - TSTART, - e[0], - e[1], - e[0].type(), - self.concordant_edge(e[0].v1)[0], - len(self.concordant_edge(e[0].v1)[1]), - hg.interval( - e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos - e[0].v1.strand * self.max_insert - ).rep_content(), - ) - ) - self.discordant_edge_calls[ - (tuple([(i.chrom, i.start, i.end) for i in ilist]), filter_repeats, pair_support, not ms is None) - ] = dnlist - return dnlist - - def load_edges(self, edge_file): - edge_lines = [line.strip().split() for line in open(edge_file)] - edges = [] - for el in edge_lines: - if el[2] == "None": - hom = None - hom_seq = None - else: - hom = int(el[2]) - if hom != 0: - hom_seq = el[3] - else: - hom_seq = "" - e = breakpoint_edge(el[0], hom=hom, hom_seq=hom_seq) - edges.append((e, int(el[1]))) - edges.sort(key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.1 * x[0].v1.strand) - return edges - - def get_sensitive_discordant_edges( - self, - ilist, - msrlist, - eilist=None, - filter_repeats=True, - pair_support=-1, - ms_window_size0=10000, - ms_window_size1=300, - adaptive_counts=True, - gcc=False, - amplicon_name=None, - ): - if amplicon_name is not None and os.path.exists("%s_edges_cnseg.txt" % amplicon_name): - return self.load_edges("%s_edges_cnseg.txt" % amplicon_name) - if amplicon_name is not None and os.path.exists("%s_edges.txt" % amplicon_name): - eilist = self.load_edges("%s_edges.txt" % amplicon_name) - else: - if eilist is None: - if adaptive_counts: - eilist = self.interval_discordant_edges(ilist, ms=msrlist, pair_support=pair_support) - else: - eilist = self.interval_discordant_edges(ilist, pair_support=pair_support) - eilist.sort(key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.1 * x[0].v1.strand) - if amplicon_name is not None: - edge_file = open("%s_edges.txt" % amplicon_name, "w") - for e in eilist: - edge_file.write("%s\t%s\t%s\t%s\n" % (str(e[0]), e[1], e[0].hom, e[0].hom_seq)) - edge_file.close() - eiSet = set( - [(e[0].v1.chrom, e[0].v1.pos, e[0].v1.strand, e[0].v2.chrom, e[0].v2.pos, e[0].v2.strand) for e in eilist] - ) - for i, msr in zip(ilist, msrlist): - elist = [] - for e in eilist: - if e[0].v1.pos != -1 and hg.interval(e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos).intersects(i): - elist.append(e) - ms_vlist = [] - msv_index = {} - for msi in range(len(msr) - 1): - if msr[msi + 1].info["cn"] < msr[msi].info["cn"]: - msv = breakpoint_vertex(i.chrom, msr[msi].end, 1) - else: - msv = breakpoint_vertex(i.chrom, msr[msi].end + 1, -1) - ms_vlist.append(msv) - 
msv_index[msv] = msi - print("Meanshift", str(i), len(ms_vlist), ms_vlist) - sys.stdout.flush() - for msv in ms_vlist: - msi = msv_index[msv] - if ("end_refined" in msr[msi].info) and msr[msi].info["end_refined"]: - msve = [ - e - for e in elist - if e[0].v1.strand * (msr[msi].info["cn"] - msr[msi + 1].info["cn"]) > 0 - and abs(e[0].v1.pos - msr[msi].end) < self.max_insert + ms_window_size1 - ] - if len(msve) == 0: - # print("finesearch discordant edges", i.chrom, str(msr[msi]), str(msr[msi + 1])) - efine = self.interval_discordant_edges( - hg.interval( - i.chrom, - msv.pos - ms_window_size0 - self.max_insert, - msv.pos + ms_window_size1 + self.max_insert, - ), - pair_support=2, - ) - if ( - len( - [ - e - for e in efine - if e[0].v1.strand * (msr[msi].info["cn"] - msr[msi + 1].info["cn"]) > 0 - ] - ) - > 0 - ): - if ( - len( - [ - (e[1], e[0]) - for e in efine - if e[0].v1.strand * (msr[msi].info["cn"] - msr[msi + 1].info["cn"]) > 0 - and abs(e[0].v1.pos - msv.pos) < ms_window_size1 - ] - ) - > 0 - ): - ebest = max( - [ - (e[1], e[0]) - for e in efine - if e[0].v1.strand * (msr[msi].info["cn"] - msr[msi + 1].info["cn"]) > 0 - and abs(e[0].v1.pos - msv.pos) < ms_window_size1 - ] - ) - else: - ebest = max( - [ - (e[1], e[0]) - for e in efine - if e[0].v1.strand * (msr[msi].info["cn"] - msr[msi + 1].info["cn"]) > 0 - ] - ) - ebest = (ebest[1], ebest[0]) - # msve = [ebest] - - # print("finesearch discordant edge found", i.chrom, str(msr[msi]), str(msr[msi + 1]), str(ebest[0]), ebest[1]) - if ( - ebest[0].v1.chrom, - ebest[0].v1.pos, - ebest[0].v1.strand, - ebest[0].v2.chrom, - ebest[0].v2.pos, - ebest[0].v2.strand, - ) not in eiSet: - elist.append(ebest) - eilist.append(ebest) - eiSet.add( - ( - ebest[0].v1.chrom, - ebest[0].v1.pos, - ebest[0].v1.strand, - ebest[0].v2.chrom, - ebest[0].v2.pos, - ebest[0].v2.strand, - ) - ) - if ( - len( - hg.interval_list( - [hg.interval(ebest[0].v2.chrom, ebest[0].v2.pos, ebest[0].v2.pos)] - ).intersection(ilist) - ) - > 0 - ): - if ( - ebest[0].v2.chrom, - ebest[0].v2.pos, - ebest[0].v2.strand, - ebest[0].v1.chrom, - ebest[0].v1.pos, - ebest[0].v1.strand, - ) not in eiSet: - eilist.append((breakpoint_edge(ebest[0].v2, ebest[0].v1), ebest[1])) - eiSet.add( - ( - ebest[0].v2.chrom, - ebest[0].v2.pos, - ebest[0].v2.strand, - ebest[0].v1.chrom, - ebest[0].v1.pos, - ebest[0].v1.strand, - ) - ) - elist.sort(key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.1 * x[0].v1.strand) - eilist.sort(key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.1 * x[0].v1.strand) - - # Maintainer found the following lines had no effect - # else: - # # print("msv end not refined", str(msr[msi]), str(msr[msi + 1])) - # msve = [e for e in elist if e[0].v1.strand * (msr[msi].info['cn'] - msr[msi + 1].info['cn']) > 0 and abs( - # e[0].v1.pos - msr[msi].end) < self.max_insert + ms_window_size0] - - if amplicon_name is not None: - edge_file = open("%s_edges_cnseg.txt" % amplicon_name, "w") - for e in eilist: - edge_file.write("%s\t%s\t%s\t%s\n" % (str(e[0]), e[1], e[0].hom, e[0].hom_seq)) - edge_file.close() - return eilist - - def construct_segment(self, v): - cpos = v.pos - v.strand * self.max_insert / 2 - cprevious = v.pos - cflag = v.pos - while abs(cflag - cprevious) < self.window_size: - cprevious = cpos - cpos = cpos - v.strand * self.max_insert / 2 - drange = [cpos, cpos + v.strand * self.max_insert] - drange.sort() - dlist = [a for a in self.fetch(v.chrom, drange[0], drange[1])] - if len(dlist) * self.read_length < self.min_coverage * self.max_insert: - continue - cflag 
= cprevious - if abs(cprevious - v.pos) > self.max_insert: - v1 = breakpoint_vertex(v.chrom, cprevious, v.strand) - discordant_neighbors = self.get_discordant_neighbors(v1) - if len(discordant_neighbors) > 0: - return v1 - v2 = breakpoint_vertex(v.chrom, cpos, -1 * v.strand) - discordant_neighbors = self.get_discordant_neighbors(v2) - if len(discordant_neighbors) > 0: - return v2 - return None - - # Methods to find all intervals in amplicon - def interval_neighbors(self, i, ilist=[], rdlist=[], t=0, gcc=False): - i2 = self.interval_extend(i) - # i2 = i - # i2 = self.interval_extend(i, ilist, rdlist) - ms_window_size0 = 10000 - ms_window_size1 = 300 - merge_thresh = 100000 - logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + " Calculating coverage meanshift segmentation") - msrlist = [self.get_meanshift(i2, ms_window_size0, ms_window_size1, gcc)] - logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + " Detecting breakpoint edges (interval neighbors)") - edges = self.interval_discordant_edges(i2, ms=msrlist) - edges = [(e[1], e[0]) for e in edges] - edges.sort(reverse=True) - edges = [(e[1], e[0]) for e in edges] - ei = 0 - logging.info("#TIME " + "%.3f\t" % (time() - TSTART) + " Selecting neighbors") - neighbors = hg.interval_list([]) - while len(neighbors) < 10 and ei < len(edges): - covered = False - for i3 in ilist + neighbors: - if ( - i3.chrom == edges[ei][0].v2.chrom - and edges[ei][0].v2.pos >= i3.start - and edges[ei][0].v2.pos <= i3.end - ): - ei += 1 - covered = True - break - if covered: - continue - found_neighbor = False - for i3 in rdlist: - if ( - i3.chrom == edges[ei][0].v2.chrom - and edges[ei][0].v2.pos >= i3.start - and edges[ei][0].v2.pos <= i3.end - ): - # n = i3 - n = hg.interval(i3.chrom, i3.start, i3.end) - found_neighbor = True - if not found_neighbor: - if edges[ei][0].v2.strand < 0: - n = self.interval_extend( - hg.interval( - edges[ei][0].v2.chrom, - edges[ei][0].v2.pos, - min(hg.chrLen[hg.chrNum(edges[ei][0].v2.chrom)] - 1, edges[ei][0].v2.pos + self.max_insert), - ) - ) - else: - n = self.interval_extend( - hg.interval( - edges[ei][0].v2.chrom, max(0, edges[ei][0].v2.pos - self.max_insert), edges[ei][0].v2.pos - ) - ) - if found_neighbor or n.size() > self.max_insert + 2: - n.info = edges[ei][1] - neighbors.append(n) - ei += 1 - neighbors.sort() - mc = neighbors.merge_clusters(extend=merge_thresh) # previously ms_window_size0 - for c in mc: - c[0].info = sum([c1.info for c1 in c[1]]) - nn = hg.interval_list([c[0] for c in mc]) - for e in nn: - logging.debug( - "#TIME " + "%.3f\t" % (time() - TSTART) + " interval_neighbors: edges %s %s" % (str(i), str(e)) - ) - return nn - - def interval_hops(self, i=None, ilist=[], rdlist=[], gcc=False, explore=True): - if type(i) == list or type(i) == hg.interval_list: - i1list = i - i = i[0] - else: - i1list = hg.interval_list([i]) - logging.debug("#TIME " + "%.3f\t" % (time() - TSTART) + " interval_hops: init " + str(i)) - ms_window_size0 = 10000 - i2list = hg.interval_list([]) - for i2 in i1list: - ii = self.interval_extend(i2) - logging.debug( - "#TIME " + "%.3f\t" % (time() - TSTART) + " interval_hops: interval extend " + str(i2) + " " + str(ii) - ) - i2list.append(ii) - seen_list = hg.interval_list([]) - unseen_list = [(0, ii) for ii in i2list] - heapq.heapify(unseen_list) - # clist = hg.interval_list(i2list) - clist = hg.interval_list([ii[0] for ii in i2list.merge_clusters(extend=1)]) - while len(seen_list) < 10 and len(unseen_list) > 0: - icc = heapq.heappop(unseen_list) - ic = icc[1] - if explore == False and 
len(hg.interval_list([ic]).intersection(i2list)) == 0: - seen_list.append(ic) - continue - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " interval_hops: check rd " - + str(i) - + " " - + str(ic) - + " " - + str(len(hg.interval_list([ic]).intersection(rdlist))) - ) - if ( - len(hg.interval_list([ic]).intersection(i2list)) == 0 - and len(hg.interval_list([ic]).intersection(rdlist)) > 0 - ): - seen_list.append(ic) - continue - # logging.debug("#TIME " + '%.3f\t'%(time() - TSTART) + " interval_hops: search new " + str(i) + " " + str(ic)) - # logging.info("#TIME " + '%.3f\t'%(time() - TSTART) + " Searching new neighbors for interval: " + str(ic)) - icn = self.interval_neighbors(ic, clist, rdlist=rdlist, gcc=gcc) - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " interval_hops: neighbors " - + str(i) - + " " - + str(ic) - + " " - + str(len(icn)) - ) - for ic2 in icn: - logging.info( - "#TIME " + "%.3f\t" % (time() - TSTART) + " New neighbor: %s (weight=%d)" % (str(ic2), ic2.info) - ) - contained = False - for i2 in clist: - if i2.contains(ic2): - contained = True - if contained: - continue - if ic2.size() < 2 * ms_window_size0 and len(self.interval_discordant_edges(ic2)) < 2: - continue - if explore or len(hg.interval_list([ic]).intersection(i2list)) > 0: - heapq.heappush(unseen_list, (-ic2.info, ic2)) - clist.append(ic2) - seen_list.append(ic) - retlist = hg.interval_list(i2list + seen_list) - retlist = [r[0] for r in retlist.merge_clusters(extend=1)] - return retlist - - def interval_amplified(self, i, filter_conserved=True, filter_small=True): - if ( - len( - hg.interval_list([i]).intersection(hg.conserved_regions) - + hg.interval_list([i]).intersection(hg.centromere_list) - ) - > 0 - ): - return False - ms_window_size = 10000 - num_w = 0 - num_high = 0 - if filter_small and i.size() < 2 * ms_window_size and len(self.interval_discordant_edges(i)) < 2: - return False - wc = self.window_coverage(i, ms_window_size, exact=False) - mc = self.median_coverage() - if self.span_coverage: - arm_coverage = self.median_coverage(refi=i) - else: - arm_coverage = mc - for w in wc: - num_w += 1 - # if w[1] > mc[0] + 3 * mc[2]: - if self.sensitivems == False: - if mc[0] < arm_coverage[0] and w[1] > max( - arm_coverage[0] + 3 * mc[2] * math.sqrt(arm_coverage[0] / mc[0]), - arm_coverage[0] + 3.0 * mc[0] / 2.0, - ): - num_high += 1 - elif mc[0] >= arm_coverage[0] and w[1] > max(mc[0] + 3 * mc[2], 5.0 * mc[0] / 2.0): - num_high += 1 - else: - if mc[0] < arm_coverage[0] and w[1] > arm_coverage[0] + 3 * mc[2] * math.sqrt(arm_coverage[0] / mc[0]): - num_high += 1 - elif mc[0] >= arm_coverage[0] and w[1] > mc[0] + 3 * mc[2]: - num_high += 1 - # wc_high = len([w for w in wc if w[1] > mc[1] + 3 * mc[2]]) - if num_high > num_w / 5: - return True - elif filter_small == False and i.size() < 2 * ms_window_size and len(self.interval_discordant_edges(i)) >= 2: - return True - else: - return False - - def interval_extend(self, i, strand=0, i0=None): - ms_window_size = 10000 - extend_size = max(i.size() / ms_window_size, 1) - max_window_size = 300000000 - if strand >= 0: - extend_right = 1 - right_size = extend_size - else: - extend_right = -1 - right_size = 0 - if strand <= 0: - extend_left = 1 - left_size = extend_size - else: - extend_left = -1 - left_size = 0 - ic = copy.copy(i) - while ic.size() < max_window_size and (extend_left >= 0 or extend_right >= 0): - if extend_right >= 0: - if right_size < 1: - extend_right = -1 - elif ic.end + right_size * ms_window_size > 
hg.chrLen[hg.chrNum(ic.chrom)]: - if self.interval_amplified( - hg.interval(ic.chrom, ic.end, hg.chrLen[hg.chrNum(ic.chrom)]), filter_small=False - ): - ic.end = hg.chrLen[hg.chrNum(ic.chrom)] - extend_right = -1 - else: - extend_right = 0 - right_size = right_size / 2 - elif self.interval_amplified( - hg.interval(ic.chrom, ic.end, ic.end + right_size * ms_window_size), filter_small=False - ): - ic.end = ic.end + right_size * ms_window_size - if extend_right == 1: - right_size = 2 * right_size - else: - right_size = right_size / 2 - if right_size < 1: - # ic.end = min(ic.end + ms_window_size, hg.chrLen[hg.chrNum(ic.chrom)]) - extend_right = -1 - else: - extend_right = 0 - right_size = right_size / 2 - if extend_left >= 0: - if left_size < 1: - extend_left = -1 - elif ic.start - left_size * ms_window_size <= 1: - if self.interval_amplified(hg.interval(ic.chrom, 1, ic.start), filter_small=False): - ic.start = 1 - extend_left = -1 - else: - extend_left = 0 - left_size = left_size / 2 - elif self.interval_amplified( - hg.interval(ic.chrom, ic.start - left_size * ms_window_size, ic.start), filter_small=False - ): - ic.start = ic.start - left_size * ms_window_size - if extend_left == 1: - left_size = 2 * left_size - else: - left_size = left_size / 2 - if left_size < 1: - # ic.start = max(ic.end - ms_window_size, 1) - extent_left = -1 - else: - extend_left = 0 - left_size = left_size / 2 - if self.interval_amplified( - hg.interval( - ic.chrom, - max(0, ic.end - 2 * ms_window_size), - min(ic.end + 2 * ms_window_size, hg.chrLen[hg.chrNum(ic.chrom)]), - ), - filter_small=False, - ): - ic.end = min(ic.end + 10 * ms_window_size, hg.chrLen[hg.chrNum(ic.chrom)]) - if self.interval_amplified( - hg.interval( - ic.chrom, - max(ic.start - 2 * ms_window_size, 0), - min(ic.start + 2 * ms_window_size, hg.chrLen[hg.chrNum(ic.chrom)]), - ), - filter_small=False, - ): - ic.start = max(ic.start - 10 * ms_window_size, 0) - if strand >= 0: - ide = self.interval_discordant_edges( - hg.interval(ic.chrom, ic.end + 1, min(hg.chrLen[hg.chrNum(ic.chrom)], ic.end + ms_window_size)) - ) - for e in ide: - if e[0].v1.strand == 1: - ic.end = min(ic.end + 2 * ms_window_size, hg.chrLen[hg.chrNum(ic.chrom)]) - break - if strand <= 0: - ide = self.interval_discordant_edges(hg.interval(ic.chrom, max(0, ic.start - ms_window_size), ic.start - 1)) - for e in ide: - if e[0].v1.strand == -1: - ic.start = max(ic.start - 2 * ms_window_size, 0) - break - # if ic.size() > ms_window_size: - logging.debug( - "#TIME " + "%.3f\t" % (time() - TSTART) + " interval_extend: %s, %s, %s" % (str(i), strand, str(ic)) - ) - return ic - - # Method to create breakpoint graph, find network flow and cycle decomposition - def interval_filter_vertices( - self, ilist0, gcc=False, adaptive_counts=True, eilist=None, amplicon_name=None, runmode="FULL" - ): - ms_window_size0 = 10000 - ms_window_size1 = 300 - ilist0.sort() - ilist = hg.interval_list([a[0] for a in ilist0.merge_clusters()]) - - # finesearch edges near refined meanshifts and add to eilist, create vertices corresponding to all meanshifts and uncovered meanshifts - all_msv = [] - msv_diff = {} - all_msv_nocover = [] - logging.info("#TIME " + "%.3f\t" % (time() - self.tstart) + " Calculating coverage meanshift segmentation") - msrlist = [self.get_meanshift(i, ms_window_size0, ms_window_size1, gcc) for i in ilist] - logging.info( - "#TIME " + "%.3f\t" % (time() - self.tstart) + " Detecting breakpoint edges (interval filter vertices)" - ) - sensitive_elist = self.get_sensitive_discordant_edges( - ilist, - 
msrlist, - eilist, - ms_window_size0=ms_window_size0, - ms_window_size1=ms_window_size1, - adaptive_counts=adaptive_counts, - amplicon_name=amplicon_name, - ) - eilist = sensitive_elist - logging.info("#TIME " + "%.3f\t" % (time() - self.tstart) + " Building breakpoint graph") - for i, msr in zip(ilist, msrlist): - elist = [] - for e in eilist: - if hg.interval(e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos).intersects(i): - elist.append(e) - - ms_vlist = [] - msv_index = {} - for msi in range(len((msr)) - 1): - if msr[msi + 1].info["cn"] < msr[msi].info["cn"]: - msv = breakpoint_vertex(i.chrom, msr[msi].end, 1) - else: - msv = breakpoint_vertex(i.chrom, msr[msi].end + 1, -1) - msv_diff[msv] = msr[msi + 1].info["cn"] - msr[msi].info["cn"] - ms_vlist.append(msv) - msv_index[msv] = msi - all_msv.append(ms_vlist) - # logging.debug("Meanshift", str(i), len(ms_vlist), ms_vlist) - msve_match = {} - for msv in ms_vlist: - msi = msv_index[msv] - if msr[msi].info["end_refined"]: - msve = [ - e - for e in elist - if e[0].v1.strand * (msr[msi + 1].info["cn"] - msr[msi].info["cn"]) < 0 - and abs(e[0].v1.pos - msr[msi].end) < self.max_insert + ms_window_size1 - ] - else: - msve = [ - e - for e in elist - if e[0].v1.strand * (msr[msi + 1].info["cn"] - msr[msi].info["cn"]) < 0 - and abs(e[0].v1.pos - msr[msi].end) < self.max_insert + ms_window_size0 - ] - if len(msve) > 0: - msve_match[msv] = msve - msv_nocover = [msv for msv in ms_vlist if msv not in msve_match] - all_msv_nocover.append(msv_nocover) - # logging.debug("Meanshift no cover", str(i), msv_nocover) - - # setup graph for flow optimization - ngvlist_full = [] - elist_full = [] - ms_addlist = [] - kce = defaultdict(lambda: 0) # number of concordant reads - koe = defaultdict(lambda: 0.0) # number of reads mapping outside the interval - kbpe = defaultdict(lambda: 0.0) # number of discordant reads across breakpoint edge - new_graph = breakpoint_graph() - s = new_graph.new_vertex(ilist[0].chrom, -1, -1) - for i, msr, ms_vlist, msv_nocover in zip(ilist, msrlist, all_msv, all_msv_nocover): - ngvlist = [] - elist = [] - for e in eilist: - if hg.interval(e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos).intersects(i): - elist.append(e) - - # add vertices to new_graph - ei = 0 - nei = 0 - msi = 0 - if len(elist) == 0 or elist[ei][0].v1.strand == 1 or elist[ei][0].v1.pos > i.start: - if len(msv_nocover) == 0 or msv_nocover[msi].strand == 1 or msv_nocover[msi].pos > i.start: - nv = new_graph.new_vertex(i.chrom, i.start, -1) - ne = new_graph.new_edge(s, nv) - koe[ne] = len(self.interval_crossing_arcs(i.chrom, i.start, i.start + self.max_insert, -1, ilist)) - else: # len(ms_vlist) > 0 and ms_vlist[0].strand == -1 and ms_vlist[0].pos > i.start + self.max_insert - nv = new_graph.new_vertex(i.chrom, msv_nocover[msi].pos, msv_nocover[msi].strand) - ne = new_graph.new_edge(s, nv) - koe[ne] = len( - self.interval_crossing_arcs( - i.chrom, msv_nocover[msi].pos, msv_nocover[msi].strand + self.max_insert, -1, ilist - ) - ) - ms_addlist.append(msv_nocover[msi]) - msi += 1 - else: - nv = new_graph.new_vertex(i.chrom, elist[0][0].v1.pos, -1) - oecount = len(self.interval_crossing_arcs(nv.chrom, nv.pos, nv.pos + self.max_insert, -1, ilist)) - if oecount >= ( - self.pair_support - if not adaptive_counts - else self.pair_support_count(nv.chrom, nv.pos, -1, meanshift=msrlist, sensitivems=False) - ): - ne = new_graph.new_edge(s, nv) - koe[ne] = len(self.interval_crossing_arcs(nv.chrom, nv.pos, nv.pos + self.max_insert, -1, ilist)) - ei += 1 - ngvlist.append(nv) - vc = 
breakpoint_vertex(ngvlist[0].chrom, ngvlist[0].pos, ngvlist[0].strand) - while ei < len(elist) or msi < len(msv_nocover): - vp = vc - vc_type = "edge" - if msi >= len(msv_nocover): - vc = elist[ei][0].v1 - ei += 1 - elif ei >= len(elist): - vc = msv_nocover[msi] - vc_type = "meanshift" - ms_addlist.append(msv_nocover[msi]) - msi += 1 - elif elist[ei][0].v1.pos < msv_nocover[msi].pos: - vc = elist[ei][0].v1 - ei += 1 - elif elist[ei][0].v1.pos == msv_nocover[msi].pos and elist[ei][0].v1.strand < msv_nocover[msi].strand: - vc = elist[ei][0].v1 - ei += 1 - else: - vc = msv_nocover[msi] - vc_type = "meanshift" - ms_addlist.append(msv_nocover[msi]) - msi += 1 - if (vc.pos == vp.pos and vc.strand <= vp.strand) or (vc.pos == vp.pos + 1 and vc.strand < vp.strand): - continue - logging.debug( - "#TIME " + "%.3f\t" % (time() - TSTART) + "interval_filter vertices new: " + str(vc) + " " + vc_type - ) - if vc.strand == 1: - if ngvlist[nei].strand == 1: - nvc_prime = new_graph.new_vertex(ngvlist[nei].chrom, ngvlist[nei].pos + 1, -1) - # oecount = len(self.interval_crossing_arcs(nvc_prime.chrom, nvc_prime.pos, nvc_prime.pos + self.max_insert, -1, ilist)) - # if oecount >= (self.pair_support if not adaptive_counts else self.pair_support_count(nvc_prime.chrom, nvc_prime.pos, -1, meanshift=zip(ilist, msrlist, cnlist), sensitivems=False)): - # ne = new_graph.new_edge(s, nvc_prime) - # koe[ne] = oecount - ce = self.concordant_edge(vp) - nce = new_graph.new_edge(ngvlist[nei], nvc_prime) - kce[nce] = len(ce[1]) - ngvlist.append(nvc_prime) - nei += 1 - nv = new_graph.new_vertex(vc.chrom, vc.pos, 1) - if vc_type == "meanshift": - oecount = len( - self.interval_crossing_arcs( - nv.chrom, max(0, nv.pos - 2 * self.max_insert), nv.pos + 2 * self.max_insert, 1, ilist - ) - ) - else: - oecount = len( - self.interval_crossing_arcs(nv.chrom, max(0, nv.pos - self.max_insert), nv.pos, 1, ilist) - ) - # if vc_type == 'meanshift' or oecount >= (self.pair_support if not adaptive_counts else self.pair_support_count(nv.chrom, nv.pos, 1, meanshift=zip(ilist, msrlist, cnlist))): - if vc_type == "meanshift": - ne = new_graph.new_edge(s, nv) - koe[ne] = oecount - ngvlist.append(nv) - nei += 1 - else: - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "interval_filter vertices: adding reverse edge = " - + str(vc) - ) - if ngvlist[nei].strand == 1 and not ( - ngvlist[nei].chrom == vc.chrom and ngvlist[nei].pos == vc.pos - 1 - ): - nvc_prime = new_graph.new_vertex(ngvlist[nei].chrom, ngvlist[nei].pos + 1, -1) - oecount = len( - self.interval_crossing_arcs( - nvc_prime.chrom, nvc_prime.pos, nvc_prime.pos + self.max_insert, -1, ilist - ) - ) - # if oecount >= (self.pair_support if not adaptive_counts else self.pair_support_count(nvc_prime.chrom, nvc_prime.pos, -1, meanshift=zip(ilist, msrlist, cnlist), sensitivems=False)): - # ne = new_graph.new_edge(s, nvc_prime) - # koe[ne] = oecount - ce = self.concordant_edge(vp) - nce = new_graph.new_edge(ngvlist[nei], nvc_prime) - kce[nce] = len(ce[1]) - ngvlist.append(nvc_prime) - nei += 1 - if ngvlist[nei].strand == -1: - nvc_prime = new_graph.new_vertex(vc.chrom, vc.pos - 1, 1) - oecount = len( - self.interval_crossing_arcs( - nvc_prime.chrom, max(0, nvc_prime.pos - self.max_insert), nvc_prime.pos, 1, ilist - ) - ) - # if oecount >= (self.pair_support if not adaptive_counts else self.pair_support_count(nvc_prime.chrom, nvc_prime.pos, 1, meanshift=zip(ilist, msrlist, cnlist), sensitivems=False)): - # ne = new_graph.new_edge(s, nvc_prime) - # koe[ne] = oecount - 
ngvlist.append(nvc_prime) - nei += 1 - nv = new_graph.new_vertex(vc.chrom, vc.pos, -1) - if vc_type == "meanshift": - oecount = len( - self.interval_crossing_arcs( - nv.chrom, max(0, nv.pos - 2 * self.max_insert), nv.pos + 2 * self.max_insert, -1, ilist - ) - ) - else: - oecount = len( - self.interval_crossing_arcs(nv.chrom, nv.pos, nv.pos + self.max_insert, -1, ilist) - ) - # if vc_type == 'meanshift' or oecount >= (self.pair_support if not adaptive_counts else self.pair_support_count(nv.chrom, nv.pos, -1, meanshift=zip(ilist, msrlist, cnlist), sensitivems=False)): - if vc_type == "meanshift": - ne = new_graph.new_edge(s, nv) - koe[ne] = oecount - ce = self.concordant_edge(vc) - nce = new_graph.new_edge(ngvlist[nei], nv) - kce[nce] = len(ce[1]) - ngvlist.append(nv) - nei += 1 - # ei += 1 - if ngvlist[nei].strand == -1: - nv = new_graph.new_vertex(i.chrom, i.end, 1) - ne = new_graph.new_edge(s, nv) - koe[ne] = len(self.interval_crossing_arcs(nv.chrom, nv.pos - self.max_insert, nv.pos, 1, ilist)) - ngvlist.append(nv) - nei += 1 - elif ngvlist[nei].strand == 1 and ngvlist[nei].pos < i.end: - nvc_prime = new_graph.new_vertex(ngvlist[nei].chrom, ngvlist[nei].pos + 1, -1) - oecount = len( - self.interval_crossing_arcs( - nvc_prime.chrom, - nvc_prime.pos, - min(hg.chrLen[hg.chrNum(nvc_prime.chrom)], nvc_prime.pos + self.max_insert), - -1, - ilist, - ) - ) - if oecount >= ( - self.pair_support - if not adaptive_counts - else self.pair_support_count( - nvc_prime.chrom, nvc_prime.pos, -1, meanshift=msrlist, sensitivems=False - ) - ): - ne = new_graph.new_edge(s, nvc_prime) - koe[ne] = oecount - ce = self.concordant_edge(vp) - nce = new_graph.new_edge(ngvlist[nei], nvc_prime) - kce[nce] = len(ce[1]) - ngvlist.append(nvc_prime) - nei += 1 - nv = new_graph.new_vertex(i.chrom, i.end, 1) - ne = new_graph.new_edge(s, nv) - koe[ne] = len(self.interval_crossing_arcs(nv.chrom, nv.pos - self.max_insert, nv.pos, 1, ilist)) - ngvlist.append(nv) - nei += 1 - ngvlist_full = ngvlist_full + ngvlist - elist_full = elist_full + elist - print("MSstats", len(ms_vlist), len(ms_addlist)) - for msa in ms_vlist: - print( - "MSadd", - str(msa), - msv_diff[msa], - self.foldup_count(msa.chrom, msa.pos, msa.strand), - msa in ms_addlist, - ) # , self.pair_support_count(msa.chrom, msa.pos, msa.strand, ms, True) - for e0 in elist_full: - e = e0[0] - if len(ilist.intersection([hg.interval(e.v2.chrom, e.v2.pos, e.v2.pos)])) > 0 and e.v1.pos >= e.v2.pos: - ne = new_graph.add_edge(e) - logging.debug( - "#TIME " + "%.3f\t" % (time() - TSTART) + "interval_filter vertices: added edge e = " + str(e) - ) - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "interval_filter vertices: added edge ne = " - + str(ne) - + " " - + ne.edge_type - ) - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "interval_filter vertices: added edge ne, v1.elist = " - + str(ne.v1) - + " " - + ",".join(map(str, ne.v1.elist)) - ) - logging.debug( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + "interval_filter vertices: added edge ne, v2.elist = " - + str(ne.v2) - + " " - + ",".join(map(str, ne.v2.elist)) - ) - if ne is None: - raise ValueError( - "ne is None:" - + str(e) - + " " - + str(len(e0[1])) - + "\n" - + ",".join(map(str, new_graph.vs.values())) - ) - kbpe[ne] = e0[1] - elif len(ilist.intersection([hg.interval(e.v2.chrom, e.v2.pos, e.v2.pos)])) == 0: - ne = new_graph.add_edge(breakpoint_edge(breakpoint_vertex(s.chrom, s.pos, s.strand), e.v1)) - koe[ne] = e0[1] - for nei in range(1, len(ngvlist_full)): - if 
ngvlist_full[nei].strand == 1: - new_graph.new_edge(ngvlist_full[nei - 1], ngvlist_full[nei], edge_type="sequence") - # else: - # new_graph.new_edge(ngvlist[nei-1], ngvlist[nei]) - for e in koe: - koe[e] = max(0.0001, koe[e]) - # set up all constants - logging.info("#TIME " + "%.3f\t" % (time() - self.tstart) + " Optimizing graph copy number flow") - C = self.median_coverage()[0] / 2 - print("C (haploid coverage) = ", C) - # G = new_graph - - seqlist = [e for e in new_graph.es.values() if e.edge_type == "sequence"] - n = len(seqlist) - l = [abs(e.v2.pos - e.v1.pos) + 1 for e in seqlist] - k = [len([a for a in self.fetch(e.v1.chrom, e.v1.pos, e.v2.pos)]) for e in seqlist] - # kgcc = [self.interval_coverage(hg.interval(i.chrom, e.v1.pos, e.v2.pos), gcc=True) * (e.v2.pos - e.v1.pos) / self.read_length for e in seqlist] - # k = kgcc - # kcc = [self.interval_coverage(hg.interval(e.v1.chrom, e.v1.pos, e.v2.pos)) * (e.v2.pos - e.v1.pos) for e in seqlist] - ke = {} - ke.update(kbpe) - ke.update(kce) - ke.update(koe) - K = [ - len([a for a in self.fetch(e.v1.chrom, e.v1.pos, e.v2.pos)]) - * self.read_length - / (abs(e.v2.pos - e.v1.pos) + 1.0) - for e in seqlist - ] - # edge read count kbpe defined above - bplist = [e for e in new_graph.es.values() if (e.edge_type == "discordant" or e.edge_type == "breakpoint")] - m = len(bplist) - bpdict = {bplist[bpi]: bpi for bpi in range(len(bplist))} - print( - "########## len bplist", - len(bplist), - "; ################ kbpe, kce, koe = ", - len(kbpe), - len(kce), - len(koe), - ) - - # set up problem size and coefficients - - asub = [] - aval = [] - for i in range(n): - subarr = [i] - valarr = [1.0] - for e in seqlist[i].v1.elist: - if e.edge_type == "sequence": - continue - if n + bpdict[e] in subarr: - j = subarr.index(n + bpdict[e]) - valarr[j] += -1.0 - else: - subarr.append(n + bpdict[e]) - valarr.append(-1.0) - asub.append(subarr) - aval.append(valarr) - subarr = [i] - valarr = [1.0] - for e in seqlist[i].v2.elist: - if e.edge_type == "sequence": - continue - if n + bpdict[e] in subarr: - j = subarr.index(n + bpdict[e]) - valarr[j] += -1.0 - else: - subarr.append(n + bpdict[e]) - valarr.append(-1.0) - asub.append(subarr) - aval.append(valarr) - - coeff_f = [-1 * ki for ki in k] + [-1 * ke[e] for e in bplist] - coeff_g = [C * li / self.read_length for li in l] + [ - (self.max_insert) * C / 2 / self.read_length for e in bplist - ] - const_h = [0.0001] * (n + m) - coeff_c = [C * li / self.read_length for li in l] + [ - (self.max_insert) * C / 2 / self.read_length for e in bplist - ] - - # Solve the optimization problem - res = mosek_solver.call_mosek(n, m, asub, aval, coeff_c, coeff_f, coeff_g, const_h) - - wehc = {} - for msv_ilist in zip(all_msv, ilist): - slist = hg.interval_list( - [ - hg.interval("\t".join(map(str, [sq[0].v1.chrom, sq[0].v1.pos, sq[0].v2.pos, sq[1]]))) - for sq in zip(seqlist, res) - ] - ) - slist.sort() - msl = [msv_ilist[1].start] + [v.pos for v in msv_ilist[0]] + [msv_ilist[1].end] - mslist = hg.interval_list( - [hg.interval(msv_ilist[1].chrom, msl[i], msl[i + 1]) for i in range(len(msl) - 1)] - ) - for msi in mslist: - if len(hg.interval_list([msi]).intersection(slist)) == 0: - print("MSnointersection", str(msi), msl) - for s in slist: - print(str(s)) - print("=============================") - for s in seqlist: - print(str(s)) - exit() - elif sum([ap[0].intersection(ap[1]).size() for ap in hg.interval_list([msi]).intersection(slist)]) == 0: - print("MS0intersection", str(msi)) - exit() - - edge_code = defaultdict(lambda: 
"discordant", {"concordant": "concordant", "source": "source"}) - - graph_logger.info( - "SequenceEdge: StartPosition, EndPosition, PredictedCopyCount, AverageCoverage, Size, NumberReadsMapped" - ) - for si in range(n): - graph_logger.info( - "sequence\t" - + "\t".join( - map( - str, - [ - seqlist[si].v1, - seqlist[si].v2, - res[si], - K[si], - seqlist[si].v2.pos - seqlist[si].v1.pos, - k[si], - ], - ) - ) - ) - wehc[seqlist[si]] = float(res[si]) - graph_logger.info( - "BreakpointEdge: StartPosition->EndPosition, PredictedCopyCount, NumberOfReadPairs, HomologySizeIfAvailable(<0ForInsertions), Homology/InsertionSequence" - ) - for bpi in range(m): - # print edge_code[bplist[bpi].type()], str(bplist[bpi]), res[n + bpi], ke[bplist[bpi]], bplist[bpi].kmer_homology() - graph_logger.info( - "\t".join( - map( - str, - [ - edge_code[bplist[bpi].type()], - bplist[bpi], - res[n + bpi], - ke[bplist[bpi]], - bplist[bpi].hom, - bplist[bpi].hom_seq, - ], - ) - ) - ) - wehc[bplist[bpi]] = float(res[n + bpi]) - lenlist = len(ilist) - if len(ilist0) >= 10: - lenlist = len(ilist0) - all_msv_cat = reduce(lambda x, y: x + y, all_msv, []) - oncolist = ",".join(set([a[1].info["Name"] for a in ilist.intersection(hg.oncogene_list)])) + "," - istr = ",".join([i.chrom + ":" + str(i.start) + "-" + str(i.end) for i in ilist]) - summary_logger.info("TotalIntervalSize = " + str(sum([a.size() for a in ilist]))) - summary_logger.info( - "AmplifiedIntervalSize = " - + str(sum([seqlist[si].v2.pos - seqlist[si].v1.pos for si in range(n) if res[si] >= 2.5])) - ) - if len([seqlist[si].v2.pos - seqlist[si].v1.pos for si in range(n) if res[si] >= 2.5]) > 0: - summary_logger.info( - "AverageAmplifiedCopyCount = " - + str( - sum([res[si] * (seqlist[si].v2.pos - seqlist[si].v1.pos) for si in range(n) if res[si] >= 2.5]) - / sum([seqlist[si].v2.pos - seqlist[si].v1.pos + 1 for si in range(n) if res[si] >= 2.5]) - ) - ) - else: - summary_logger.info("AverageAmplifiedCopyCount = 2") - summary_logger.info("#Chromosomes = " + str(len(set([i.chrom for i in ilist])))) - summary_logger.info("#SeqenceEdges = " + str(n)) - summary_logger.info("#BreakpointEdges = " + str(len(kbpe))) - summary_logger.info("#CoverageShifts = " + str(len(all_msv_cat))) - summary_logger.info("#MeanshiftSegmentsCopyCount>5 = " + str(len([v for v in msv_diff.values() if v > 5]))) - summary_logger.info( - "#Foldbacks = " - + str(len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1])) - ) - summary_logger.info( - "#CoverageShiftsWithBreakpointEdges = " + str(len([msa for msa in all_msv_cat if msa in ms_addlist])) - ) - - # Summary, #intervals, t talsize, size>2.5, AvgCoverage>2.5, #chromosomes, #sequenceedges, #breakpointedges, #meanshiftbreaks, #meanshift>5, #msfoldbackedges, #msfoldbackedges, #mswithoutbreakpoint, oncogenes, representativestring, #bpedgeswithcommonkmers - if len([seqlist[si].v2.pos - seqlist[si].v1.pos for si in range(n) if res[si] >= 2.5]) > 0: - # print '\t'.join(map(str, ["Summary:", lenlist, sum([a.size() for a in ilist]), sum([seqlist[si].v2.pos - seqlist[si].v1.pos for si in range(n) if res[si] >= 2.5]), sum([res[si] * (seqlist[si].v2.pos - seqlist[si].v1.pos) for si in range(n) if res[si] >= 2.5]) / sum([seqlist[si].v2.pos - seqlist[si].v1.pos for si in range(n) if res[si] >= 2.5]), len(Set([i.chrom for i in ilist])), n, len(kbpe), len(all_msv_cat), len([v for v in msv_diff.values() if v > 5]), len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), len([msa for msa 
in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), len([msa for msa in all_msv_cat if msa in ms_addlist]), oncolist, istr, len([e for e in kbpe if e.kmer_homology()])])) - print( - "\t".join( - map( - str, - [ - "Summary:", - lenlist, - sum([a.size() for a in ilist]), - sum([seqlist[si].v2.pos - seqlist[si].v1.pos + 1 for si in range(n) if res[si] >= 2.5]), - sum( - [ - res[si] * (seqlist[si].v2.pos - seqlist[si].v1.pos + 1) - for si in range(n) - if res[si] >= 2.5 - ] - ) - / sum([seqlist[si].v2.pos - seqlist[si].v1.pos + 1 for si in range(n) if res[si] >= 2.5]), - len(set([i.chrom for i in ilist])), - n, - len(kbpe), - len(all_msv_cat), - len([v for v in msv_diff.values() if v > 5]), - len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), - len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), - len([msa for msa in all_msv_cat if msa in ms_addlist]), - oncolist, - istr, - ], - ) - ) - ) - for i in ilist: - if ( - sum( - [ - seqlist[si].v2.pos - seqlist[si].v1.pos + 1 - for si in range(n) - if res[si] >= 5 - and hg.interval(seqlist[si].v1.chrom, seqlist[si].v1.pos, seqlist[si].v2.pos).intersects(i) - ] - ) - == 0 - ): - print("IntervalAmplifiedSize: ", i.chrom, i.start, i.end, 0, 2) - continue - print( - "IntervalAmplifiedSize: ", - i.chrom, - i.start, - i.end, - sum( - [ - seqlist[si].v2.pos - seqlist[si].v1.pos + 1 - for si in range(n) - if res[si] >= 5 - and hg.interval(seqlist[si].v1.chrom, seqlist[si].v1.pos, seqlist[si].v2.pos).intersects(i) - ] - ), - sum( - [ - res[si] * (seqlist[si].v2.pos - seqlist[si].v1.pos + 1) - for si in range(n) - if res[si] >= 5 - and hg.interval(seqlist[si].v1.chrom, seqlist[si].v1.pos, seqlist[si].v2.pos).intersects(i) - ] - ) - / sum( - [ - seqlist[si].v2.pos - seqlist[si].v1.pos + 1 - for si in range(n) - if res[si] >= 5 - and hg.interval(seqlist[si].v1.chrom, seqlist[si].v1.pos, seqlist[si].v2.pos).intersects(i) - ] - ), - ) - else: - # print '\t'.join(map(str, ["Summary:", lenlist, sum([a.size() for a in ilist]), sum([seqlist[si].v2.pos - seqlist[si].v1.pos for si in range(n) if res[si] >= 2.5]), sum([res[si] * (seqlist[si].v2.pos - seqlist[si].v1.pos) for si in range(n)]) / sum([seqlist[si].v2.pos - seqlist[si].v1.pos for si in range(n)]), len(Set([i.chrom for i in ilist])), n, len(kbpe), len(all_msv_cat), len([v for v in msv_diff.values() if v > 5]), len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), len([msa for msa in all_msv_cat if msa in ms_addlist]), oncolist, istr, len([e for e in kbpe if e.kmer_homology()])])) - print( - "\t".join( - map( - str, - [ - "Summary:", - lenlist, - sum([a.size() for a in ilist]), - sum([seqlist[si].v2.pos - seqlist[si].v1.pos + 1 for si in range(n) if res[si] >= 2.5]), - sum([res[si] * (seqlist[si].v2.pos - seqlist[si].v1.pos + 1) for si in range(n)]) - / sum([seqlist[si].v2.pos - seqlist[si].v1.pos + 1 for si in range(n)]), - len(set([i.chrom for i in ilist])), - n, - len(kbpe), - len(all_msv_cat), - len([v for v in msv_diff.values() if v > 5]), - len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), - len([msa for msa in all_msv_cat if self.foldup_count(msa.chrom, msa.pos, msa.strand) >= 1]), - len([msa for msa in all_msv_cat if msa in ms_addlist]), - oncolist, - istr, - ], - ) - ) - ) - - if runmode == "BPGRAPH": - return - 
logging.info("#TIME " + "%.3f\t" % (time() - self.tstart) + " Plotting SV View") - - interval_index = 1 - for i in ilist: - cycle_logger.info("Interval\t" + "\t".join([str(interval_index), i.chrom, str(i.start), str(i.end)])) - interval_index += 1 - - new_graph.cycle_decomposition(wehc, s) - - # Plot coverage, meanshift copy count estimates and discordant edges in interval - def plot_segmentation(self, ilist, amplicon_name, segments=[], scale_list=[], eilist=None, font="small"): - fighsize = 12 - figvsize = 5 - if font == "large": - matplotlib.rcParams.update({"font.size": 18}) - figvsize = 5.85 - if font == "all_amplicons": - matplotlib.rcParams.update({"font.size": 48}) - figvsize = 5.21 - fighsize = 24 - fig = plt.figure(figsize=(fighsize, figvsize)) - plt.subplots_adjust(left=73 / 1000.0, right=1 - 73 / 1000.0, bottom=1 / 4.0, top=1 - 1 / 10.0) - # dpi = 300 - if font == "large": - plt.subplots_adjust(left=73 / 1000.0, right=1 - 73 / 1000.0, bottom=2.1 / 5.85, top=90 / 100.0) - if font == "all_amplicons": - plt.subplots_adjust(left=73 / 1000.0, right=1 - 73 / 1000.0, bottom=1 / 5.21, top=95 / 100.0) - - dpi = 1000.0 / fighsize - gs = gridspec.GridSpec(2, 1, height_ratios=[8, 2]) - if font == "all_amplicons": - gs = gridspec.GridSpec(2, 1, height_ratios=[5, 4]) - ax = fig.add_subplot(gs[0, 0]) - if font == "large": - plt.title(os.path.basename(amplicon_name), fontsize=28) - elif font != "all_amplicons": - plt.title(os.path.basename(amplicon_name)) - # if font == 'all_amplicons': - # plt.title(os.path.basename(amplicon_name), fontsize=56) - ax2 = ax.twinx() - ax2.set_ylabel("CN") - ax3 = fig.add_subplot(gs[1, 0], sharex=ax) - ax.set_xlim(0, 1) - ax.set_ylabel("Coverage") - ax.yaxis.set_label_coords(-0.05, 0.25) - ax2.yaxis.set_label_coords(1.05, 0.33) - if font == "all_amplicons": - ax.set_ylabel("") - ax2.set_ylabel("") - for b in ilist.offset_breaks(): - ax.axvline(b[0], linestyle=b[1], color="k") - ax3.axvline(b[0], linestyle=b[1], color="k") - - cx = [] - wc = [] - - elist_dict = {} - max_edge = 4 - scale_max_cov = 0 - scale_max_ms = 0 - # msrlist = [self.get_meanshift(i) if i.size() > 50000 else self.meanshift_segmentation(i, window_size=300) for i in ilist] - msrlist = [ - self.get_meanshift(i) if i.size() > 50000 else self.get_meanshift(i, window_size0=300) for i in ilist - ] - sensitive_elist = self.get_sensitive_discordant_edges( - ilist, - msrlist, - eilist, - ms_window_size0=10000, - ms_window_size1=300, - adaptive_counts=True, - amplicon_name=amplicon_name, - ) - eilist = sensitive_elist - - for i, msr in zip(ilist, msrlist): - de = [ - e - for e in eilist - if e[0].v1.pos != -1 and hg.interval(e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos).intersects(i) - ] # self.interval_discordant_edges(i) - elist_dict[i] = de - elist_dict[i].sort(key=lambda x: hg.absPos(x[0].v1.chrom, x[0].v1.pos) + 0.1 * x[0].v1.strand) - for e in eilist: - eposlist = [] - if e[0].v1.pos != -1: - eposlist.append(hg.interval(e[0].v1.chrom, e[0].v1.pos, e[0].v1.pos)) - if e[0].v2.pos != -1: - eposlist.append(hg.interval(e[0].v2.chrom, e[0].v2.pos, e[0].v2.pos)) - if len(scale_list) == 0 or len(hg.interval_list(eposlist).intersection(scale_list)) > 0: - max_edge = max(max_edge, e[1]) - - for i in ilist: - if i.size() > 1000000: - wc_i = [w for w in self.window_coverage(i, 10000, exact=False)] - - elif i.size() > 100000: - wc_i = [w for w in self.window_coverage(i, 1000, exact=False)] - - else: - wc_i = [w for w in self.window_coverage(i, 150, exact=False)] - - cx += [((i.chrom, (c[0].start + c[0].end) / 2), 
c[1]) for c in wc_i] - wc += wc_i - - cx0 = [c for c in cx if ilist.xpos(c[0][0], c[0][1]) is not None] - ax.bar( - [ilist.xpos(c[0][0], c[0][1]) for c in cx0], - [c[1] for c in cx0], - 0.0001, - zorder=1, - edgecolor="0.7", - linewidth=1, - ) - # cmax = max([c[1] for c in wc]) - # logging.debug("cmax was " + str(cmax)) - - covl = [] - for i, msr in zip(ilist, msrlist): - for seg in msr: - avg_cov = np.average([c[1] for c in cx0 if c[0][0] == seg.chrom and seg.start <= c[0][1] <= seg.end]) - if len(scale_list) == 0 or len(hg.interval_list([i]).intersection(scale_list)) > 0: - covl += [c[1] for c in cx0 if c[0][0] == seg.chrom and seg.start <= c[0][1] <= seg.end] - scale_max_cov = max(scale_max_cov, avg_cov) - if seg.info["cn"] != float("inf"): - scale_max_ms = max(scale_max_ms, seg.info["cn"]) - else: - scale_max_ms = max(scale_max_ms, 2000) - - ax2.plot( - (ilist.xpos(seg.chrom, max(i.start, seg.start)), ilist.xpos(seg.chrom, min(i.end, seg.end))), - (seg.info["cn"], seg.info["cn"]), - linewidth=4, - color="k", - ) - - logging.debug("Max cov, max ms scales set to: " + str(scale_max_cov) + " " + str(scale_max_ms)) - covl.sort() - if len(covl) > 0: - m95cov = covl[-(len(covl) // 20)] - else: - m95cov = 0 - - if 0 < scale_max_cov < m95cov: - scale_max_ms = scale_max_ms * m95cov / scale_max_cov - scale_max_cov = scale_max_cov * m95cov / scale_max_cov - - y_scale = 3.0 - # y_scale = 2.5 - if font == "all_amplicons": - y_scale = 2.5 - if scale_max_cov > 0: - ax.set_ylim(0.1, y_scale * scale_max_cov) - else: - (ymin, ymax) = ax.get_ylim() - ax.set_ylim(ymin, y_scale * ymax) - if scale_max_ms > 0: - (ymin, ymax) = (0.1, scale_max_ms) - ax2.set_ylim(0.1, y_scale * ymax) - else: - (ymin, ymax) = ax2.get_ylim() - ax2.set_ylim(0.1, y_scale * ymax) - - for i in ilist: - for el in elist_dict[i]: - e = el[0] - if ilist.xpos(e.v2.chrom, e.v2.pos) is None and ilist.xpos(e.v1.chrom, e.v1.pos) is None: - continue - elif ilist.xpos(e.v2.chrom, e.v2.pos) is None: - ax2.axvline( - ilist.xpos(e.v1.chrom, e.v1.pos), - color=ecolor[e.type()], - linewidth=4.0 * min(1, float(el[1]) / max_edge), - alpha=0.5, - zorder=10, - ) - ax2.plot( - (ilist.xpos(e.v1.chrom, e.v1.pos), ilist.xpos(e.v1.chrom, e.v1.pos) - 0.01 * e.v1.strand), - (0, 0), - linewidth=8.0 * min(1, float(el[1]) / max_edge), - color=ecolor[e.type()], - ) - elif ilist.xpos(e.v1.chrom, e.v1.pos) is None: - ax2.axvline( - ilist.xpos(e.v2.chrom, e.v2.pos), - color=ecolor[e.type()], - linewidth=4.0 * min(1, float(el[1]) / max_edge), - alpha=0.5, - zorder=10, - ) - ax2.plot( - (ilist.xpos(e.v2.chrom, e.v2.pos), ilist.xpos(e.v2.chrom, e.v2.pos) - 0.01 * e.v2.strand), - (0, 0), - linewidth=8.0 * min(1, float(el[1]) / max_edge), - color=ecolor[e.type()], - ) - else: - xmid = (ilist.xpos(e.v1.chrom, e.v1.pos) + ilist.xpos(e.v2.chrom, e.v2.pos)) / 2 - xdia = abs(ilist.xpos(e.v2.chrom, e.v2.pos) - ilist.xpos(e.v1.chrom, e.v1.pos)) - ydia = (1.0 + xdia) * 3 * ymax - pseudo_edge = breakpoint_edge( - breakpoint_vertex(e.v1.chrom, hg.absPos(e.v1.chrom, e.v1.pos), e.v1.strand), - breakpoint_vertex(e.v1.chrom, hg.absPos(e.v2.chrom, e.v2.pos), e.v2.strand), - ) - ee = Arc( - (xmid, 0), - xdia, - ydia, - fill=False, - linewidth=4.0 * min(1, float(el[1]) / max_edge), - color=ecolor[pseudo_edge.type()], - zorder=4, - theta1=0.1, - theta2=180, - ) - ax2.add_patch(ee) - ax2.plot( - (ilist.xpos(e.v1.chrom, e.v1.pos), ilist.xpos(e.v1.chrom, e.v1.pos) - 0.01 * e.v1.strand), - (0, 0), - linewidth=8.0 * min(1, float(el[1]) / max_edge), - color=ecolor[pseudo_edge.type()], - ) - 
ax2.plot( - (ilist.xpos(e.v2.chrom, e.v2.pos), ilist.xpos(e.v2.chrom, e.v2.pos) - 0.01 * e.v2.strand), - (0, 0), - linewidth=8.0 * min(1, float(el[1]) / max_edge), - color=ecolor[pseudo_edge.type()], - ) - ax2.axhline(2.0, alpha=0.8, linewidth=0.5, color="r") - - gparity = 0 - ry = 0.60 - ty = 0.65 - ogene_width = 4 - if font == "large": - # ry = 0.85 - # ry = 0.87 - ry = 0.77 - ogene_width = 12 - ogene_plotted = [] - for i in ilist: - glist = hg.interval_list([i]).intersection(hg.oncogene_list) - ogene_plotted += [g[1].info["Name"] for g in glist] - for g in glist: - if font == "large": - ty = 0 - elif font == "all_amplicons": - if gparity == 0: - # ty = -0.1 - ty = -0.07 - else: - # ty = 0.20 - ty = 0.3 - else: - if gparity == 0: - ty = 0 - else: - ty = 0.37 - if font == "large": - ax3.plot( - [ilist.xpos(i.chrom, max(g[1].start, i.start)), ilist.xpos(i.chrom, min(g[1].end, i.end))], - [ry, ry], - "r-", - linewidth=ogene_width, - ) - ax3.text( - (ilist.xpos(i.chrom, max(g[1].start, i.start)) + ilist.xpos(i.chrom, min(g[1].end, i.end))) - / 2.0, - ty, - g[1].info["Name"], - horizontalalignment="center", - verticalalignment="bottom", - fontsize=28, - zorder=4, - ) - elif font == "all_amplicons": - ogene_width = 36 - ax3.plot( - [ilist.xpos(i.chrom, max(g[1].start, i.start)), ilist.xpos(i.chrom, min(g[1].end, i.end))], - [0.85, 0.85], - "r-", - linewidth=ogene_width, - ) - ax3.text( - (ilist.xpos(i.chrom, max(g[1].start, i.start)) + ilist.xpos(i.chrom, min(g[1].end, i.end))) - / 2.0, - -0.05 + 0.37 * gparity, - g[1].info["Name"], - horizontalalignment="center", - verticalalignment="bottom", - fontsize=48, - zorder=4, - ) - else: - ax3.plot( - [ilist.xpos(i.chrom, max(g[1].start, i.start)), ilist.xpos(i.chrom, min(g[1].end, i.end))], - [ry, ry], - "r-", - linewidth=ogene_width, - ) - ax3.text( - (ilist.xpos(i.chrom, max(g[1].start, i.start)) + ilist.xpos(i.chrom, min(g[1].end, i.end))) - / 2.0, - ty, - g[1].info["Name"], - horizontalalignment="center", - verticalalignment="bottom", - ) - gparity = (gparity + 1) % 2 - for s in segments: - if not i.intersects(s): - continue - ss = i.intersection(s) - ax3.add_patch( - Rectangle( - [ilist.xpos(i.chrom, max(ss.start, i.start)), 0.65], - ilist.xpos(i.chrom, min(ss.end, i.end)) - ilist.xpos(i.chrom, max(ss.start, i.start)), - 0.25, - fc=chrcolor[s.info[1]], - ec="k", - ) - ) - if font == "large": - ax3.text( - (ilist.xpos(i.chrom, max(ss.start, i.start)) + ilist.xpos(i.chrom, min(ss.end, i.end))) / 2.0, - 0, - s.info[0], - horizontalalignment="center", - verticalalignment="bottom", - fontsize=28, - ) - elif font == "large" or font == "all_amplicons": - ax3.text( - (ilist.xpos(i.chrom, max(ss.start, i.start)) + ilist.xpos(i.chrom, min(ss.end, i.end))) / 2.0, - 0, - s.info[0], - horizontalalignment="center", - verticalalignment="bottom", - fontsize=48, - ) - else: - ax3.text( - (ilist.xpos(i.chrom, max(ss.start, i.start)) + ilist.xpos(i.chrom, min(ss.end, i.end))) / 2.0, - 0.2 + int(s[0]) % 2 * 0.15, - s.info[0], - horizontalalignment="center", - verticalalignment="top", - ) - # ax3.text((xpos(max(s[1].start, i.start)) + xpos(min(s[1].end, i.end)))/2.0, 0.2+0%2*0.15, s[0], horizontalalignment='center', verticalalignment='top') - - if font == "large" or font == "all_amplicons": - axyticks = ax.get_yticks() - ax.set_yticks([0, axyticks[-1]]) - ax2yticks = ax2.get_yticks() - ax2.set_yticks([0, ax2yticks[-1]]) - - ax.xaxis.set_visible(False) - ax2.xaxis.set_visible(False) - ax3.yaxis.set_visible(False) - ax.spines["left"].set_visible(False) - 
ax.spines["right"].set_visible(False) - ax.spines["top"].set_visible(False) - ax2.spines["left"].set_visible(False) - ax2.spines["right"].set_visible(False) - ax2.spines["top"].set_visible(False) - ax3.spines["left"].set_visible(False) - ax3.spines["right"].set_visible(False) - ax2.spines["bottom"].set_linewidth(4) - ax3.tick_params("x", length=0, which="major") - ax3.tick_params("x", length=5, which="minor") - if font == "all_amplicons": - previous_chrom = "" - chrom_index = 1 - interval_poslist = [] - for i in ilist: - if i.chrom != previous_chrom: - chrom_index = 1 - else: - chrom_index += 1 - previous_chrom = i.chrom - imin = ilist.xpos(i.chrom, i.start) - imax = ilist.xpos(i.chrom, i.end) - segname = "" - if imax - imin > 0.2: - segname = i.chrom + "." + str(chrom_index) - elif imax - imin > 0.05: - segname = i.chrom.strip("chr") + "." + str(chrom_index) - elif imax - imin > 0.02: - segname = i.chrom.strip("chr") - interval_poslist.append((segname, (imax + imin) / 2)) - ax3.xaxis.set_major_locator(ticker.FixedLocator([c[1] for c in interval_poslist])) - ax3.xaxis.set_major_formatter(ticker.FixedFormatter([c[0] for c in interval_poslist])) - else: - chrmin = {} - chrmax = {} - for i in ilist: - if i.chrom not in chrmin: - chrmin[i.chrom] = ilist.xpos(i.chrom, i.start) - chrmax[i.chrom] = ilist.xpos(i.chrom, i.end) - else: - chrmin[i.chrom] = min(ilist.xpos(i.chrom, i.start), chrmin[i.chrom]) - chrmax[i.chrom] = max(ilist.xpos(i.chrom, i.end), chrmax[i.chrom]) - chrposlist = [] - for c in chrmin: - chrposlist.append((c if chrmax[c] - chrmin[c] > 0.10 else c.strip("chr"), (chrmin[c] + chrmax[c]) / 2)) - ax3.xaxis.set_major_locator(ticker.FixedLocator([c[1] for c in chrposlist])) - ax3.xaxis.set_major_formatter(ticker.FixedFormatter([c[0] for c in chrposlist])) - xposlist = [] - if font != "all_amplicons": - for i in ilist: - xposlist.append((str(i.start), ilist.xpos(i.chrom, i.start))) - xposlist.append((str(i.end), ilist.xpos(i.chrom, i.end))) - ax3.xaxis.set_minor_locator(ticker.FixedLocator([c[1] for c in xposlist])) - ax3.xaxis.set_minor_formatter(ticker.FixedFormatter([c[0] for c in xposlist])) - plt.setp(ax3.xaxis.get_minorticklabels(), rotation=90) - ax3.tick_params(axis="x", which="minor", pad=15) - # ax3.tick_params(axis='x', which='minor', pad=-5) - ax3.yaxis.set_major_formatter(ticker.NullFormatter()) - ax3.set_ylim(0, 1) - - # ax3.spines['bottom'].set_visible(False) - # ax3.xaxis.set_visible(False) - - fig.subplots_adjust(hspace=0) - try: - fig.savefig(amplicon_name + ".png", dpi=dpi) - fig.savefig(amplicon_name + ".pdf", dpi=dpi) - except np.linalg.linalg.LinAlgError: - logging.error("Numpy LinAlgError when forming amplicon plot! Cannot save " + amplicon_name + " image\n") - - plt.close() diff --git a/bin/breakpoint_graph.py b/bin/breakpoint_graph.py deleted file mode 100755 index 57ae5d21..00000000 --- a/bin/breakpoint_graph.py +++ /dev/null @@ -1,983 +0,0 @@ -# This software is Copyright 2017 The Regents of the University of California. All Rights Reserved. Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. 
Permission to make commercial use of this software may be obtained by contacting: -# -# Office of Innovation and Commercialization -# -# University of California -# -# La Jolla, CA 92093-0910 -# -# (858) 534-5815 -# -# invent@ucsd.edu -# -# This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. -# -# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. -# Author: Viraj Deshpande -# Contact: virajbdeshpande@gmail.com -# Source: https://github.com/jluebeck/AmpliconArchitect -# Commit: 2172cdfd5b2834f98f60a5ee77f282249e16f527 - -import sys - -from collections import defaultdict -import heapq -import logging - -from abstract_graph import * -import ref_util as hg - -cycle_logger = logging.getLogger("cycle") - - -class breakpoint_vertex(abstract_vertex): - """Class representing breakpoint vertex derived from abstract_graph.abstract_vertex - - Attributes: - chrom = chromosome name - pos = 1-based chromosomal location - strand = 1/-1 for forward/reverse strand - vid = (optional)id of vertex - graph = (optional) graph to which vertex belongs""" - - def __init__(self, chrom="", pos=-2, strand=1, vid=0, graph=None): - """2 ways to initialize: - 1) chrom: breakpoint_vertex string in the format chrom:pos("+"/"-"") - 2) chrom, pos, strand: name(STR), pos (INT), strand("+"/"-"")""" - if pos == -2: - vstring = chrom - chrom = vstring[: vstring.find(":")] - pos = int(vstring[vstring.find(":") + 1 : -1]) - strand = 1 if vstring[-1] == "+" else -1 - if graph is not None and graph.has_vertex(chrom, pos, strand): - raise Exception("Duplicate vertex added") - abstract_vertex.__init__(self, vid, graph) - self.chrom = chrom - self.pos = pos - self.strand = strand - - def __repr__(self): - """String format chrom:pos(+/-)""" - if self.strand == 1: - return self.chrom + ":" + str(self.pos) + "+" - else: - return self.chrom + ":" + str(self.pos) + "-" - - def __hash__(self): - return str(self).__hash__() - - def __gt__(self, y): - """Order vertices by absolute position (See hg19util.absPos) + strand""" - return hg.absPos(self.chrom, self.pos) + 0.4 * self.strand > hg.absPos(y.chrom, y.pos) + 0.4 * y.strand - - -class breakpoint_edge(abstract_edge): - """Class representing breakpoint edge derived from abstract_graph.abstract_edge - - Attributes: - v1 = breakpoint_vertex 1 of the edge (recommend using v2 > v1) - v2 = breakpoint_vertex 2 of the edge - edge_type = "discordant"/"breakpoint" or "concordant" : genomic connectivity or source; "sequence": genomic interval - eid = (optional) edge id - graph = 
(optional) graph to which edge belongs""" - - def __init__( - self, v1, v2=None, eid=0, graph=None, update_vertices=True, edge_type="discordant", hom=None, hom_seq=None - ): - """2 ways to initialize: - 1) v1 = breakpoint_edge string in the format breakpoint_vertex1->breakpoint_vertex2 - 2) v1,v2 = breakpoint_point_vertices - Optional: - eid: edge id - graph: breakpoint_graph - update_vertices: True if vertices edge should be added to vertex neighbor list - edge_type: " = "discordant"/"breakpoint" or "concordant" : genomic connectivity or source; "sequence": genomic interval - Required: - If edge_type = "sequence": v1.chrom = v2.chrom, v1.pos > v2.pos else if equal v1.strand > v2.strand - If edge_type = "concordant": v1.chrom = v2.chrom, |v1.pos - v2.pos| = 1 and the smaller has strand = 1 else -1 - """ - if type(v1) == str: - estr = v1 - v1 = breakpoint_vertex(estr.split(">")[0][:-1]) - v2 = breakpoint_vertex(estr.split(">")[1]) - abstract_edge.__init__(self, v1, v2, eid, graph, update_vertices) - if edge_type in ["concordant", "sequence"]: - if v1.chrom != v2.chrom: - raise Exception("Edge of type " + edge_type + " connects different chromosomes.") - if edge_type in ["concordant", "sequence"]: - if v1.strand == v2.strand: - raise Exception("Edge of type " + edge_type + " connects same strand.") - if edge_type == "concordant": - if (v1.strand == 1 and v1.pos + 1 != v2.pos) or (v2.strand == 1 and v2.pos + 1 != v1.pos): - raise Exception("Edge of type " + edge_type + " connects non-adjacent positions.") - if edge_type == "sequence": - if v1.strand == -1 and v1.pos > v2.pos: - raise Exception( - "Start position for sequence edge greater than end position:" + str(v1) + "->" + str(v2) - ) - if v1.strand == 1 and v2.pos > v1.pos: - raise Exception("Start position for sequence edge greater than end position") - self.edge_type = edge_type - self.hom = hom - self.hom_seq = hom_seq - - def sequence(self, flank_size=-1): - if self.edge_type == "sequence": - seq = hg.interval(self.v1.chrom, self.v1.pos, self.v2.pos).sequence() - if flank_size > 0: - seq = ( - hg.interval(self.v1.chrom, self.v1.pos - flank_size + 1, self.v1.pos).sequence() - + seq - + hg.interval(self.v2.chrom, self.v2.pos, self.v2.pos + flank_size - 1).sequence() - ) - else: - if self.hom == None: - seq = "N" * 20 - else: - seq = self.hom_seq - if flank_size == -1: - flank_size = 1000 - if flank_size > 0: - if self.hom is not None and self.hom > 0: - hom = self.hom - else: - hom = 0 - if self.edge_type == "source": - if self.v2.strand == -1: - right_seq = hg.interval( - self.v2.chrom, self.v2.pos + hom, self.v2.pos + hom + flank_size - 1 - ).sequence() - left_seq = "" - else: - left_seq = hg.interval( - self.v2.chrom, self.v2.pos - hom - flank_size + 1, self.v2.pos - hom - ).sequence() - right_seq = "" - elif self.v1.strand == 1: - left_seq = hg.interval( - self.v1.chrom, self.v1.pos - hom - flank_size + 1, self.v1.pos - hom - ).sequence() - if self.v2.strand == -1: - right_seq = hg.interval( - self.v2.chrom, self.v2.pos + hom, self.v2.pos + hom + flank_size - 1 - ).sequence() - else: - right_seq = hg.interval( - self.v2.chrom, self.v2.pos - hom - flank_size + 1, self.v2.pos - hom, strand=-1 - ).sequence() - else: - right_seq = hg.interval( - self.v1.chrom, self.v1.pos + hom, self.v1.pos + hom + flank_size - 1 - ).sequence() - if self.v2.strand == -1: - left_seq = hg.interval( - self.v2.chrom, self.v2.pos + hom, self.v2.pos + hom + flank_size - 1, strand=-1 - ).sequence() - else: - left_seq = hg.interval( - self.v2.chrom, self.v2.pos 
- hom - flank_size + 1, self.v2.pos - hom - ).sequence() - seq = left_seq + seq + right_seq - return seq - - def kmer_homology(self, k=10, span=100): - """Number of shared k-mers within "span" distance on either side of vertex positions""" - seq1 = "".join( - [ - a.capitalize() - for a in hg.interval( - self.v1.chrom, - max(1, self.v1.pos - span), - min(self.v1.pos + span, hg.chrLen[hg.chrNum(self.v1.chrom)]), - self.v1.strand, - ).sequence() - ] - ) - seq2 = "".join( - [ - a.capitalize() - for a in hg.interval( - self.v2.chrom, - max(1, self.v2.pos - span), - min(self.v2.pos + span, hg.chrLen[hg.chrNum(self.v2.chrom)]), - -1 * self.v2.strand, - ).sequence() - ] - ) - kset1 = set([seq1[i : i + 10] for i in range(len(seq1) - k + 1)]) - kset2 = set([seq2[i : i + 10] for i in range(len(seq2) - k + 1)]) - return len(kset1.intersection(kset2)) - - def type(self, min_insert=0, max_insert=500): - """Determine type of "breakpoint"/"discordant edge - Output values: - "source": Contains v.pos = -1, indicates end of linear contig. - "interchromosomal": Different chromosomes. - "everted": Forward strand of larger position connected to reverse strand of reverse, indicated by outward orientation of read-pairs, may suggest tandem duplication. - "forward": Both vertex/paired-reads map to forward strand - "reverse": Both vertex/paired-reads map to reverse strand - "discordant": Alignment distance larger/smaller than max/min insert, may indicate deletion - "concordant": Expected alignment length between min and max insert. NOTE: Different from edge_type - """ - if self.v1.pos == -1 or self.v2.pos == -1: - return "source" - elif self.v1.chrom != self.v2.chrom: - return "interchromosomal" - elif self.v1.pos <= self.v2.pos: - vmin = self.v1 - vmax = self.v2 - else: - vmin = self.v2 - vmax = self.v1 - if vmax.strand == 1 and vmin.strand == -1: - return "everted" - if vmax.pos == vmin.pos and vmax.strand != vmin.strand: - return "everted" - if vmax.strand == 1 and vmin.strand == 1: - return "forward" - if vmax.strand == -1 and vmin.strand == -1: - return "reverse" - if vmax.pos - vmin.pos > max_insert or vmax.pos - vmin.pos < min_insert: - return "discordant" - return "concordant" - - def __repr__(self): - """breakpoint_vertex1->breakpoint_vertex2""" - return str(self.v1) + "->" + str(self.v2) - - def __lt__(self, other): - return min((self.v1.chrom, self.v1.pos), (self.v2.chrom, self.v2.pos)) < min( - (other.v1.chrom, self.v1.pos), (other.v2.chrom, self.v2.pos) - ) - - -class breakpoint_graph(abstract_graph): - """Class representing breakpoint edge derived from abstract_graph.abstract_graph""" - - def __init__(self, graphfile=None): - """Creates an empty graph if no graphfile provided - Loads graph from graph file in format defined in load_graphfile""" - abstract_graph.__init__(self) - self.vhash = {} - if graphfile is not None: - self.load_graphfile(graphfile) - - def has_vertex(self, chrom, pos, strand): - vtemp = breakpoint_vertex(chrom, pos, strand) - if vtemp.__hash__() in self.vhash: - return self.vhash[vtemp.__hash__()] - else: - return None - - def new_vertex(self, chrom, pos, strand): - """Create, add and return new breakpoint_vertex if similar vertex not already present""" - v = self.has_vertex(chrom, pos, strand) - if v is not None: - return v - v = breakpoint_vertex(chrom, pos, strand, graph=self) - self.vhash[v.__hash__()] = v - return v - - def new_edge(self, v1, v2, edge_type="discordant", hom=None, hom_seq=None): - """Create, add and return breakpoint_edge to current graph. 
Recommend using "add_edge()". "new_edge()" may incorrectly add duplicate edges - Arguments: - v1,v2: breakpoint_vertex (These need to be vertices(objects) from current breakpoint graph) - edge_type = "breakpoint"/"discordant"/"concordant"/"source"/"sequence" """ - return breakpoint_edge(v1, v2, graph=self, edge_type=edge_type, hom=hom, hom_seq=hom_seq) - - def add_vertex(self, v): - """Create and add new vertex to graph if no similar vertex exists""" - return self.new_vertex(v.chrom, v.pos, v.strand) - - def add_edge(self, e, edge_type="discordant"): - """Add and return edge similar e to the graph. If e(object) already belongs to graph, return e. - Checks if corresponding vertices already present else return None. - If edge_type not defined, then inherits e.edge_type. - """ - if e.edge_type is not None: - edge_type = e.edge_type - if e.graph is not self: - v1 = self.has_vertex(e.v1.chrom, e.v1.pos, e.v1.strand) - v2 = self.has_vertex(e.v2.chrom, e.v2.pos, e.v2.strand) - if v1 is None or v2 is None: - return None - return self.new_edge(v1, v2, edge_type=edge_type, hom=e.hom, hom_seq=e.hom_seq) - return e - - def load_graphfile(self, graphfile): - """Load breakpoint_graph from file - Format: edge_type edge_string edge_copycount - """ - graphfile_handle = open(graphfile) - ll = [l.strip().split() for l in graphfile_handle] - graphfile_handle.close() - self.copy_count = defaultdict(lambda: 0, {}) - for l in ll: - if len(l) == 0: - continue - if l[0] == "sequence": - v1 = self.add_vertex(breakpoint_vertex(l[1])) - v2 = self.add_vertex(breakpoint_vertex(l[2])) - e = self.new_edge(v1, v2, edge_type="sequence") - self.copy_count[e] = float(l[3]) - if l[0] == "concordant": - e = self.add_edge(breakpoint_edge(l[1], edge_type=l[0])) - self.copy_count[e] = float(l[2]) - if l[0] == "source" or l[0] == "discordant" or l[0] == "breakpoint": - e = self.add_edge(breakpoint_edge(l[1], edge_type="discordant")) - self.copy_count[e] = float(l[2]) - return - - def djikstra_distance(self, v1, v2, min_count=0): - """Find shortest genomic path and distance between genomic locations (including strand) in the breakpoint graph with copy count = min_count. 
- Return none if not found - Return format: - (distance, path, traversal_copy_count) - distance: INT describing number of base-pairs in intermediate region - path: list of alternating (sequence edge, strand(1/-1)) and (breakpoint edge, strand(1,-1)) such that sequence edges in first/last entries contain v1/v2 - """ - for e in self.es.values(): - if e.v1.chrom == v1.chrom and e.v1.pos <= v1.pos and v1.pos <= e.v2.pos: - e1 = e - if e.v1.chrom == v2.chrom and e.v1.pos <= v2.pos and v2.pos <= e.v2.pos: - e2 = e - if self.copy_count[e1] < min_count or self.copy_count[e2] < min_count: - return None - if v1.strand == v2.strand and e1 == e2 and (v2.pos - v1.pos) * v1.strand > 0: - return (abs(v1.pos - v2.pos - 1), [(e1, v1.strand)], self.copy_count[e1]) - if v1.strand == 1: - distance = e1.v2.pos - v1.pos - else: - distance = v1.pos - e1.v1.pos - a = [(distance, [(e1, v1.strand)], self.copy_count[e1])] - heapq.heapify(a) - while len(a) > 0: - d, path, cc = heapq.heappop(a) - e, s = path[-1] - if s == 1: - e_new = e.v2.elist - v = e.v2 - else: - e_new = e.v1.elist - v = e.v1 - e_new = [e_next for e_next in e_new if e_next.edge_type != "sequence"] - e_search = [] - for en in e_new: - min_c = min(cc, self.copy_count[en]) - if min_c < min_count: - continue - if v == en.v1: - en_strand = 1 - v_seq = en.v2 - else: - en_strand = -1 - v_seq = en.v1 - if (en, en_strand) in path: - continue - if (en, -1 * en_strand) in path: - min_c = min(min_c, self.copy_count[en] / 2.0) - if min_c < min_count: - continue - en_seq, en_seqstrand = [ - (es, 1 if v_seq == es.v1 else -1) for es in v_seq.elist if es.edge_type == "sequence" - ][0] - min_c = min(min_c, self.copy_count[en_seq]) - if min_c < min_count: - continue - if (en_seq, en_seqstrand) in path and not (en_seq == e1 and e1 == e2 and en_seqstrand == v1.strand): - continue - if (en_seq, -1 * en_seqstrand) in path: - min_c = min(self.copy_count[en_seq] / 2.0, min_c) - if min_c < min_count: - continue - if en_seq == e2 and v2.strand == en_seqstrand: - if v2.strand == 1: - dd = d + v2.pos - e2.v1.pos - else: - dd = d + e2.v2.pos - v2.pos - return (dd, path + [(en, en_strand), (en_seq, en_seqstrand)], min_c) - heapq.heappush( - a, (d + en_seq.v2.pos - en_seq.v1.pos + 1, path + [(en, en_strand), (en_seq, en_seqstrand)], min_c) - ) - return None - - def cycle_decomposition(self, w, s): - """ - Decompose breakpoint_graph into 'simple' cycles. - Simple cycles may contain a sequence edge atmost once along each strand. - Reports maximum parsimonious cycles starting from thickest cycle until 80% of genomic content is covered. 
- w is dict containing weights (counts) of edges - s is source vertex, this vertex has the exception of not having a sequence edge attached""" - - def thickest_cycle(hce, wehc): - # print hce, wehc - v1 = hce[1].v1 - a = [(-1 * hce[0], v1)] - heapq.heapify(a) - hdict = {v1: (hce[0], [hce[1]], None, set())} - seenSet = set() - seenEdges = set() - completed = False - while len(a) > 0 and not completed: - # print len(a), str(a[0]), str(hdict[a[0][1]]) - v1w, v1 = heapq.heappop(a) - if v1 == hce[1].v1 and v1 in seenSet: - completed = True - break - for e in v1.elist: - if e.edge_type == "sequence": - continue - else: - v2 = e.neighbor(v1) - if v2 == s: - v3 = v2 - if e in hdict[v1][3]: - nw = min(hdict[v1][0], wehc[e] / 2) - else: - nw = min(hdict[v1][0], wehc[e]) - if not v3 in hdict or hdict[v3][2] is None or hdict[v3][0] < nw: - nhdict = hdict[v1][3].copy() - nhdict.add(e) - hdict[v3] = (nw, [e], v1, nhdict) - seenEdges.add(e) - else: - for e2 in v2.elist: - if e2.edge_type == "sequence": - se = e2 - v3 = e2.neighbor(v2) - break - if e in hdict[v1][3]: - # print 'e is seen', e, seenEdges - nw = min(hdict[v1][0], wehc[e] / 2) - elif se in hdict[v1][3]: - # print 'se is seen', se, seenEdges - nw = min(hdict[v1][0], wehc[e], wehc[se] / 2) - else: - nw = min(hdict[v1][0], wehc[e]) - if not v3 in hdict or hdict[v3][2] is None or hdict[v3][0] < nw: - nhdict = hdict[v1][3].copy() - nhdict.add(e) - nhdict.add(se) - hdict[v3] = (nw, [e, se], v1, nhdict) - # print 'seen edges', e, se, v3, hdict[v3] - seenEdges.add(e) - seenEdges.add(se) - if v3 in seenSet: - continue - seenSet.add(v3) - heapq.heappush(a, (-1 * hdict[v3][0], v3)) - if len(a) == 0 and not completed: - print("NOT COMPLETED", hce[1].v1) - s2Set = set() - tc = hdict[hce[1].v1][1] - v2 = hdict[hce[1].v1][2] - while v2 != hce[1].v1: # and not v2 in s2Set: - # print hce[1].v1, v2, s2Set - s2Set.add(v2) - if v2 not in hdict: - print(str(v2), str(hce[1].v1), str(tc)) - for ee in hce[1].v1.elist: - print(str(ee), wehc[ee]) - tc = hdict[v2][1] + tc - v2 = hdict[v2][2] - s2Set.add(v2) - # print v2, tc - return tc, hdict[hce[1].v1][0] - - total_amplicon_content = sum([(e.v2.pos - e.v1.pos) * w[e] for e in w if e.edge_type == "sequence"]) - amplicon_content_covered = 0 - w2 = w.copy() - cycle_number = 1 - cycle_list = [] - while max(w2.values()) > 0.1: - we = [(w2[e], e) for e in w2] - we.sort() - wer = we[::-1] - we = wer - wei = 0 - tcwmax = -1 - tcmax = None - tchwmax = -1 - tchmax = None - tchw = -1 - - # print "EEEEEEEEEEEEEE", len(w2) - # for e in w2: - # print "EEEEEEEEEEEE", str(e), e.edge_type, w2[e] - # print "EEEEEEEEE========================" - while wei < len(we): # and (tcwmax == -1 or we[wei][0] >= tcwmax / 2.0): - # if we[wei][1].edge_type == 'sequence': - # wei += 1 - # continue - if w2[we[wei][1]] < 0.1: - wei += 1 - continue - tc, tcw = thickest_cycle(we[wei], w2) - if len(tc) < 2: - print(str(tc[0])) - exit() - if tcw > tcwmax: - tcmax = tc - tcwmax = tcw - # sumlen = sum([abs(e.v1.pos - e.v2.pos) for e in tc if e.edge_type == 'sequence']) - # if sumlen * tcw > tchwmax: - # tchwmax = sumlen * tcw - # tchmax = tc - # tchw = tcw - wei += 1 - if tcwmax == -1: - break - tc = tcmax - tcw = tcwmax - # tc = tchmax - # tcw = tchw - if -1 in [e.v1.pos for e in tc] + [e.v2.pos for e in tc]: - csource = 0 - for ci in range(len(tc) - 1): - if -1 in [tc[ci].v1.pos, tc[ci].v2.pos] and -1 in [tc[ci + 1].v1.pos, tc[ci + 1].v2.pos]: - csource = ci + 1 - tc = tc[ci + 1 :] + tc[0 : ci + 1] - break - if tc[0].v1 == tc[1].v1 or tc[0].v1 == tc[1].v2: 
- v2 = tc[0].v1 - v1 = tc[0].v2 - else: - v2 = tc[0].v2 - v1 = tc[0].v1 - for ci in range(len(tc)): - if tc[ci].v1.pos == v1.pos: - v2 = tc[ci].v2 - else: - v2 = tc[ci].v1 - if tc[ci].edge_type == "sequence": - if v1.pos > v2.pos: - tc = tc[::-1] - break - v1 = v2 - else: - if tc[0].v1 == tc[1].v1 or tc[0].v1 == tc[1].v2: - v2 = tc[0].v1 - v1 = tc[0].v2 - else: - v2 = tc[0].v2 - v1 = tc[0].v1 - for ci in range(len(tc)): - if tc[ci].v1.pos == v1.pos: - v2 = tc[ci].v2 - else: - v2 = tc[ci].v1 - if tc[ci].edge_type == "sequence": - if v1.pos > v2.pos: - tc = tc[ci::-1] + tc[:ci:-1] - break - v1 = v2 - ci = 0 - while tc[ci].type() == "concordant" or tc[ci - 1].type() == "concordant": - ci -= 1 - tc = tc[ci:] + tc[:ci] - - if tcw == 0: - print("tcw is 0") - break - print("Cycle ", cycle_number, ": Copy count = ", tcw, tc) - cycle_edge_list = [] - ci = 1 - v0 = None - v0c = None - if tc[0].v1 == tc[1].v1 or tc[0].v1 == tc[1].v2: - v2 = tc[0].v1 - v1 = tc[0].v2 - else: - v2 = tc[0].v2 - v1 = tc[0].v1 - if tc[0].edge_type == "sequence": - v0 = v1 - v0c = v2 - elif v1.pos == -1 or v2.pos == -1: - print(v1, "->", v2) - cycle_edge_list.append((v1, v2)) - v1 = v2 - while ci < len(tc): - if (tc[ci].v1.chrom, tc[ci].v1.pos, tc[ci].v1.strand) == (v1.chrom, v1.pos, v1.strand): - v2 = tc[ci].v2 - else: - v2 = tc[ci].v1 - if v1.pos == -1 or v2.pos == -1: - if v0 is not None: - print(v0, "->", v0c) - cycle_edge_list.append((v0, v0c)) - print(v1, "->", v2) - cycle_edge_list.append((v1, v2)) - v0 = None - v0c = None - elif tc[ci].edge_type == "sequence": - if v0 is None: - v0 = v1 - v0c = v2 - else: - v0c = v2 - elif tc[ci].type() != "concordant": - if v0 is not None: - print(v0, "->", v0c) - cycle_edge_list.append((v0, v0c)) - v0 = None - v0c = None - v1 = v2 - ci += 1 - if v0 is not None: - print(v0, "->", v0c) - cycle_edge_list.append((v0, v0c)) - if amplicon_content_covered <= 0.9 * total_amplicon_content or (tcw > 0.2 * cycle_list[0][1]): - cycle_list.append([cycle_number, tcw, tc, cycle_edge_list]) - acc = tcw * sum([abs(e[1].pos - e[0].pos) for e in cycle_edge_list if -1 not in [e[0].pos, e[1].pos]]) - amplicon_content_covered += acc - cycle_number += 1 - # print tcw, tc - for e in tc: - w2[e] = w2[e] - tcw - # if w2[e] == 0.0: - # w2.pop(e) - if amplicon_content_covered > total_amplicon_content: - break - - segment_list = [] - for c in cycle_list: - max_segment = c[3][0] - max_orientation = "+" - max_segi = 0 - segi = 0 - for e in c[3]: - if (-1 in (max_segment[0].pos, max_segment[1].pos) and -1 not in (e[0].pos, e[1].pos)) or ( - abs(e[0].pos - e[1].pos) >= abs(max_segment[0].pos - max_segment[1].pos) - ): - max_segment = e - max_segi = segi - if e[0].pos + 0.4 * e[0].strand <= e[1].pos + 0.4 * e[1].strand: - max_orientation = "+" - else: - max_orientation = "-" - if e[0].pos + 0.4 * e[0].strand <= e[1].pos + 0.4 * e[1].strand: - if e not in segment_list: - segment_list.append(e) - else: - if (e[1], e[0]) not in segment_list: - segment_list.append((e[1], e[0])) - segi += 1 - if max_orientation == "+": - c[3] = c[3][max_segi:] + c[3][:max_segi] - else: - c[3] = [(e[1], e[0]) for e in c[3][: max_segi + 1][::-1] + c[3][max_segi + 1 :][::-1]] - - segment_list.sort() - segi = 1 - segment_index = {} - for s in [ss for ss in segment_list if ss[0].pos != -1 and ss[1].pos != -1]: - segment_index[s] = segi - segi += 1 - cycle_logger.info("List of cycle segments") - for s in [ss for ss in segment_list if ss[0].pos == -1 or ss[1].pos == -1]: - segment_index[s] = 0 - for s in [ss for ss in segment_list if 
ss[0].pos != -1 and ss[1].pos != -1]: - cycle_logger.info( - "Segment\t" + "\t".join([str(segment_index[s]), s[0].chrom, str(s[0].pos), str(s[1].pos)]) - ) - for c in cycle_list: - seglist = [] - orientation_list = [] - for e in c[3]: - if e in segment_index: - seglist.append(segment_index[e]) - orientation_list.append("+") - else: - seglist.append(segment_index[(e[1], e[0])]) - orientation_list.append("-") - cycle_logger.info( - "Cycle=" - + str(c[0]) - + ";Copy_count=" - + str(c[1]) - + ";Segments=" - + ",".join([str(e[0]) + str(e[1]) for e in zip(seglist, orientation_list)]) - ) - - return None - - def __repr__(self): - return "/n".join(map(str, self.vs.values() + self.es.values())) + "\n" - - -class graph_decomposition(object): - """Class represents decomposition of a breakpoint_graph with balanced edge counts into cycles/walks - Provides methods to merge and modify cycles into larger walks to represent architecture of complex rearrangements. - """ - - def __init__(self, segment_list=None, cycle_list=None, ilist=None, file=None, file_content=None): - if file is not None or file_content is not None: - self.segment_list = hg.interval_list([]) - self.segment_dict = {} - self.cycle_dict = {} - self.ilist = hg.interval_list([]) - - if file_content: - lines = file_content.split("\n") - else: - lines = str(open(file).read().decode()).split("\n") - ll = [l.strip().split() for l in lines if len(l.strip()) > 0] - for l in ll: - if "Segment" == l[0]: - s = hg.interval(l[2], int(l[3]), int(l[4]), info=[l[1]]) - self.segment_dict[l[1]] = s - self.segment_list.append(s) - elif "Cycle=" in l[0]: - ls = l[0].split(";") - ci = ls[0].split("=")[1] - cn = float(ls[1].split("=")[1]) - cl = [] - for s in ls[2].split("=")[1].split(","): - if s[-1] == "+": - cl.append((s[:-1], 1)) - else: - cl.append((s[:-1], -1)) - self.cycle_dict[ci] = (ci, cn, cl) - elif "Interval" == l[0]: - self.ilist.append(hg.interval(l[2], int(l[3]), int(l[4]), info=[l[1]])) - elif cycle_list is None: - segment_set = hg.interval_list( - [hg.interval(ss[0], ss[1], ss[2]) for ss in {(s.chrom, s.start, s.end) for s in segment_list}] - ) - segment_set.sort() - self.segment_list = segment_set - self.segment_dict = {} - seg_id = {} - cl = [] - for s in enumerate(segment_set): - self.segment_dict[str(s[0] + 1)] = s[1] - seg_id[(s[1].chrom, s[1].start, s[1].end)] = str(s[0] + 1) - for s in segment_list: - cl.append((seg_id[(s.chrom, s.start, s.end)], s.strand)) - for ii in range(len(self.segment_list)): - s = self.segment_list[ii] - s.info = [seg_id[(s.chrom, s.start, s.end)]] - self.cycle_dict = {"1": ("1", 1, cl)} - self.ilist = hg.interval_list([s[0] for s in segment_set.merge_clusters(extend=1)]) - for ii in range(len(self.ilist)): - self.ilist[ii].info = [str(ii)] - else: - self.segment_list = segment_list - self.segment_dict = {s.info[0]: s for s in segment_list} - self.cycle_dict = {c[0]: c for c in cycle_list} - if ilist is not None: - self.ilist = ilist - else: - self.ilist = hg.interval_list([s[0] for s in segment_list.merge_clusters(extend=1)]) - for ii in range(len(self.ilist)): - self.ilist[ii].info = [str(ii)] - - def next_seg_id(self): - mi = 0 - for i in self.segment_dict: - if int(i) > mi: - mi = int(i) - return str(mi + 1) - - def next_cycle_id(self): - mi = 1 - while str(mi) in self.cycle_dict: - mi += 1 - return str(mi) - - def merge(self, c1, c2, si1, si2): - cycle1 = self.cycle_dict[c1] - cycle2 = self.cycle_dict[c2] - # check if atmost 1 cycle has source vertex - if "0" in [s[0] for s in cycle1[2]] and "0" in [s[0] for 
s in cycle2[2]]: - raise Exception("Cannot merge 2 cycles with source vertices") - # if cycle2 has source vertex, exchange c1,c2 - if "0" in [s[0] for s in cycle2[2]]: - (c1, c2, si1, si2, cycle1, cycle2) = (c2, c1, si2, si1, cycle2, cycle1) - if si1 == 0 or si1 == len(cycle1[2]) - 1: - raise Exception("Cannot use source segment for merging") - # check if segments overlap - if not self.segment_dict[cycle1[2][si1][0]].intersects(self.segment_dict[cycle2[2][si2][0]]): - raise Exception( - "Segments do not overlap" - + str(self.segment_dict[cycle1[2][si1][0]]) - + " " - + str(self.segment_dict[cycle2[2][si2][0]]) - ) - # cnlist: (merged cn, cycle1cn, cycle2cn) - if cycle1[1] == 0 or cycle2[1] == 0: - raise Exception("Cycle copy numbers should be > 0 to merge") - if cycle1[1] > cycle2[1]: - cnlist = (cycle2[1], cycle1[1] - cycle2[1], 0.0) - else: - cnlist = (cycle1[1], 0.0, cycle2[1] - cycle1[1]) - seg1 = self.segment_dict[cycle1[2][si1][0]] - seg2 = self.segment_dict[cycle2[2][si2][0]] - seg1_found = False - seg2_found = False - for i in self.segment_list: - if cycle1[2][si1][1] == 1 and (i.chrom, i.start, i.end) == (seg1.chrom, seg1.start, seg2.end): - seg1_found = True - ns1 = i.info[0] - overlap1 = (ns1, cycle1[2][si1][1]) - elif cycle1[2][si1][1] == -1 and (i.chrom, i.start, i.end) == (seg1.chrom, seg2.start, seg1.end): - seg1_found = True - ns1 = i.info[0] - overlap1 = (ns1, cycle1[2][si1][1]) - if cycle1[2][si1][1] == 1 and (i.chrom, i.start, i.end) == (seg1.chrom, seg2.start, seg1.end): - seg2_found = True - ns2 = i.info[0] - overlap2 = (ns2, cycle1[2][si1][1]) - elif cycle1[2][si1][1] == -1 and (i.chrom, i.start, i.end) == (seg1.chrom, seg1.start, seg2.end): - seg2_found = True - ns2 = i.info[0] - overlap2 = (ns2, cycle1[2][si1][1]) - if not seg1_found: - ns1 = self.next_seg_id() - overlap1 = (ns1, cycle1[2][si1][1]) - if cycle1[2][si1][1] == 1: - self.segment_dict[ns1] = hg.interval(seg1.chrom, seg1.start, seg2.end, info=[ns1]) - else: - self.segment_dict[ns1] = hg.interval(seg1.chrom, seg2.start, seg1.end, info=[ns1]) - self.segment_list.append(self.segment_dict[ns1]) - if not seg2_found: - ns2 = self.next_seg_id() - overlap2 = (ns2, cycle1[2][si1][1]) - if cycle1[2][si1][1] == 1: - self.segment_dict[ns2] = hg.interval(seg1.chrom, seg2.start, seg1.end, info=[ns2]) - else: - self.segment_dict[ns2] = hg.interval(seg1.chrom, seg1.start, seg2.end, info=[ns2]) - self.segment_list.append(self.segment_dict[ns2]) - cycle1_init = cycle1[2][:si1] - if not cycle1[2][si1][1]: - (overlap1, overlap2, ns1, ns2) = (overlap2, overlap1, ns2, ns1) - if cycle1[2][si1][1] == cycle2[2][si2][1]: - cycle2_span = cycle2[2][si2 + 1 :] + cycle2[2][:si2] - else: - cycle2_span = [(s[0], -1 * s[1]) for s in cycle2[2][:si2][::-1] + cycle2[2][si2 + 1 :][::-1]] - cycle1_final = cycle1[2][si1 + 1 :] - mcycle = cycle1_init + [overlap1] + cycle2_span + [overlap2] + cycle1_final - mcycle_id = self.next_cycle_id() - self.cycle_dict[mcycle_id] = (mcycle_id, cnlist[0], mcycle) - self.cycle_dict[c1] = (c1, cnlist[1], cycle1[2]) - self.cycle_dict[c2] = (c2, cnlist[2], cycle2[2]) - return - - def pivot(self, c1, si1, si2): - cycle1 = self.cycle_dict[c1] - # check if segments overlap - if not self.segment_dict[cycle1[2][si1][0]].intersects(self.segment_dict[cycle1[2][si2][0]]): - raise Exception("Segments do not overlap") - # check if segments have opposite orientation - if cycle1[2][si1][1] == cycle1[2][si2][1]: - raise Exception("Segments should be in opposite orientation") - seg1 = self.segment_dict[cycle1[2][si1][0]] - 
seg2 = self.segment_dict[cycle1[2][si2][0]] - seg1_found = False - seg2_found = False - for i in self.segment_list: - if (i.chrom, i.start, i.end) == (seg1.chrom, seg1.start, seg2.end): - seg1_found = True - ns1 = i.info[0] - overlap1 = (ns1, cycle1[2][si1][1]) - if (i.chrom, i.start, i.end) == (seg1.chrom, seg2.start, seg1.end): - seg2_found = True - ns2 = i.info[0] - overlap2 = (ns2, cycle1[2][si2][1]) - if not seg1_found: - ns1 = self.next_seg_id() - overlap1 = (ns1, cycle1[2][si1][1]) - self.segment_dict[ns1] = hg.interval(seg1.chrom, seg1.start, seg2.end, info=[ns1]) - self.segment_list.append(self.segment_dict[ns1]) - if not seg2_found: - ns2 = self.next_seg_id() - overlap2 = (ns2, cycle1[2][si2][1]) - self.segment_dict[ns2] = hg.interval(seg1.chrom, seg2.start, seg1.end, info=[ns2]) - self.segment_list.append(self.segment_dict[ns2]) - cycle1_init = cycle1[2][:si1] - if cycle1[2][si1][1] == -1: - (overlap1, overlap2, ns1, ns2) = ( - (overlap2[0], -1 * overlap2[1]), - (overlap1[0], -1 * overlap1[1]), - ns2, - ns1, - ) - cycle1_span = [(s[0], -1 * s[1]) for s in cycle1[2][si1 + 1 : si2][::-1]] - cycle1_final = cycle1[2][si2 + 1 :] - mcycle = cycle1_init + [overlap1] + cycle1_span + [overlap2] + cycle1_final - mcycle_id = self.next_cycle_id() - self.cycle_dict[mcycle_id] = (mcycle_id, cycle1[1], mcycle) - self.cycle_dict[c1] = (c1, 0.0, cycle1[2]) - return - - def fasta_sequence(self, cycle_list=None, outfasta=None): - if cycle_list is None: - ccnlist = [(c[1], c[0]) for c in self.cycle_dict.values()] - ccnlist.sort(reverse=True) - print(ccnlist) - cycle_list = [c[1] for c in ccnlist] - fseq = "" - if outfasta is not None: - outfile = open(outfasta, "w") - for c in cycle_list: - if outfasta is None: - fseq += ( - ">Cycle" - + c - + " Copy_count=" - + str(self.cycle_dict[c][1]) - + ";Segments=" - + ",".join([seg[0] + ("+" if seg[1] == 1 else "-") for seg in self.cycle_dict[c][2]]) - + "\n" - ) - else: - outfile.write( - ">Cycle" - + c - + " Copy_count=" - + str(self.cycle_dict[c][1]) - + ";Segments=" - + ",".join([seg[0] + ("+" if seg[1] == 1 else "-") for seg in self.cycle_dict[c][2]]) - + "\n" - ) - for s in self.cycle_dict[c][2]: - if s[0] == "0": - continue - if s[1] == 1: - if outfasta is None: - fseq += self.segment_dict[s[0]].sequence(new_fa_file=self.fa_file) - else: - outfile.write(self.segment_dict[s[0]].sequence(new_fa_file=self.fa_file)) - else: - if outfasta is None: - fseq += hg.reverse_complement(self.segment_dict[s[0]].sequence(new_fa_file=self.fa_file)) - else: - outfile.write(hg.reverse_complement(self.segment_dict[s[0]].sequence(new_fa_file=self.fa_file))) - if outfasta is None: - fseq += "\n" - else: - outfile.write("\n") - if outfasta is not None: - outfile.close() - return fseq - - def __repr__(self): - s = "" - for i in self.ilist: - s += "\t".join(["Interval", i.info[0], i.chrom, str(i.start), str(i.end)]) + "\n" - for i in self.segment_list: - s += "\t".join(["Segment", i.info[0], i.chrom, str(i.start), str(i.end)]) + "\n" - ccnlist = [(c[1], c[0]) for c in self.cycle_dict.values()] - ccnlist.sort(reverse=True) - for c in ccnlist: - s += ( - "Cycle=" - + c[1] - + ";Copy_count=" - + str(c[0]) - + ";Segments=" - + ",".join([seg[0] + ("+" if seg[1] == 1 else "-") for seg in self.cycle_dict[c[1]][2]]) - + "\n" - ) - return s diff --git a/bin/check_reference.py b/bin/check_reference.py deleted file mode 100755 index 0f047b69..00000000 --- a/bin/check_reference.py +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env python - -# Author: Jens Luebeck -# Contact: jluebeck [at] 
ucsd.edu -# License: BSD 2-Clause License -# Source: https://github.com/AmpliconSuite/AmpliconSuite-pipeline -# Commit: 0a8a2ff2324b15aab7cb88d310dcc458d06c0bed - -from collections import defaultdict -import logging -import subprocess -import sys - -# create the set of autosomal chromosome names in various builds. -# should be updated if a reference is added to the data repo with more than 22 autosomes, but not necessary to do so -chrom_range = [str(x) for x in range(1, 23)] -chrom_range.extend(["chr" + x for x in chrom_range]) -chrom_range.append("hpv16ref_1") # use one representative entry from the viral genome collection to catch a viral ref. -chrom_range = set(chrom_range) - - -def get_ref_fname(aa_dr_path, rname): - with open(aa_dr_path + "/" + rname + "/file_list.txt") as infile: - for line in infile: - fields = line.rstrip().rsplit() - if fields[0] == "fa_file": - return fields[1] - - logging.error("ERROR: AA data repo 'file_list.txt' not found!\n") - return None - - -# get a subset of the chromosome names/lengths from a .fai file. -def get_ref_seq_lens(ref_genome_size_file): - chr_sizes = {} - try: - with open(ref_genome_size_file) as infile: - for line in infile: - fields = line.rstrip().rsplit() - if fields[0] in chrom_range: - chr_sizes[fields[0]] = int(fields[1]) - - except IOError: - pass - - return chr_sizes - - -# read bam header and store info -def get_bam_header(bamf, samtools): - cmd = samtools + " view -H " + bamf - return subprocess.check_output(cmd, shell=True).decode("utf-8") - - -# extract sequence lengths and ids -def extract_seq_info(bam_header): - bamSeqLenD = defaultdict(int) - linelist = bam_header.rsplit("\n") - for line in (x for x in linelist if x.startswith("@SQ")): - fields = line.rstrip().rsplit()[1:] - ld = {i.rsplit(":")[0]: i.rsplit(":")[1] for i in fields if ":" in i} - bamSeqLenD[ld["SN"]] = int(ld["LN"]) - - return bamSeqLenD - - -# check if bam matches to a reference genome in terms of length and sequence name -# returns false if the same chromosome has different length in bam vs. reference -# returns false if no chromosome names are shared between bam/reference -# returns true if no shared chromosomes have different lengths and at least one chromosome is present. -def match_ref(bamSeqLenD, ref_len_d): - overlaps = 0 - for chrom, len in ref_len_d.items(): - if bamSeqLenD[chrom] > 0 and len != bamSeqLenD[chrom]: - return False - - elif len == bamSeqLenD[chrom]: - overlaps += 1 - - return overlaps - - -# check properly paired rate on bam file -def check_properly_paired(bamf, samtools): - cmd = samtools + " flagstat {} | grep 'properly paired'".format(bamf) - t = str(subprocess.check_output(cmd, shell=True).decode("utf-8")) - logging.info("\n" + bamf + ": " + t.rstrip()) - ppp = float(t.rsplit("(")[-1].rsplit("%")[0]) - if t.startswith("0 + 0"): - logging.error( - "\nERROR: IMPROPERLY GENERATED BAM FILE! No properly-paired reads were found. The most common " - "reason for this behavior is that the reference genome contained alt contigs that were not " - "indicated to the aligner. You must re-align to use AA (and many other bioinformatic tools) on" - " this data.\n\n" - ) - sys.exit(1) - - elif ppp < 95: - logging.warning( - "WARNING: BAM FILE PROPERLY PAIRED RATE IS BELOW 95%.\nQuality of data may be insufficient for AA " - "analysis. Poorly controlled insert size distribution during sample prep can cause high fractions of read" - " pairs to be marked as discordant during alignment. Artifactual short SVs and long runtimes may occur!" 
- "\n" - ) - - return ppp - - -# check if the BAM reference matches to sequence names & lengths in a dictionary of .fai files -# returns the name of the reference genome the BAM matches to, or prints error and returns None. -def check_ref(bamf, ref_to_fai_dict, samtools): - bam_header = get_bam_header(bamf, samtools) - bamSeqLenD = extract_seq_info(bam_header) - bestref = None - bestrefhits = 0 - for refName, fai_path in ref_to_fai_dict.items(): - ref_len_d = get_ref_seq_lens(fai_path) - matched = match_ref(bamSeqLenD, ref_len_d) - if matched: - if matched > bestrefhits: - bestref = refName - bestrefhits = matched - - elif bestref and matched == bestrefhits and "_viral" in bestref and "_viral" not in refName: - bestref = refName - bestrefhits = matched - - if bestref: - logging.info("Matched " + bamf + " to reference genome " + bestref) - return bestref - - em1 = "ERROR: Could not match BAM to a known AA reference genome!\n" - em2 = """This may happen if 1) The value provided to optional argument '--ref' does not match the - reference the BAM is aligned to, or 2) The corresponding AA data repo folder for this reference - is not present, or 3) The BAM uses a different chromosome naming convention (e.g. accession - numbers instead of chromosome names). Consider inspecting the header of the BAM file and the AA - data repo directory.\n""" - - logging.error(em1) - logging.error(em2) - sys.stderr.write(em1) - sys.stderr.write(em2) - - return None diff --git a/bin/cnv_prefilter.py b/bin/cnv_prefilter.py deleted file mode 100644 index 495c85e7..00000000 --- a/bin/cnv_prefilter.py +++ /dev/null @@ -1,216 +0,0 @@ -#!/usr/bin/env python - -# Author: Jens Luebeck -# Contact: jluebeck [at] ucsd.edu -# License: BSD 2-Clause License -# Source: https://github.com/AmpliconSuite/AmpliconSuite-pipeline -# Commit: 0a8a2ff2324b15aab7cb88d310dcc458d06c0bed - -from collections import defaultdict -import logging -import os - -from intervaltree import IntervalTree - - -def merge_intervals(usort_intd, cn_cut=4.5, tol=1, require_same_cn=False, ref=None): - merged_intd = defaultdict(IntervalTree) - for chrom, usort_ints in usort_intd.items(): - # sort ints - sort_ints = sorted( - [x for x in usort_ints if x[2] > cn_cut or (ref == "GRCh38_viral" and not chrom.startswith("chr"))] - ) - if not sort_ints: - continue - - # merge sorted ints - mi = [sort_ints[0]] - for ival in sort_ints[1:]: - pass_cn_check = True - if require_same_cn and not ival[2] == mi[-1][2]: - pass_cn_check = False - - if ival[0] <= mi[-1][1] + tol and pass_cn_check: - ui = (mi[-1][0], max(ival[1], mi[-1][1]), mi[-1][2]) - mi[-1] = ui - - else: - mi.append(ival) - - for x in mi: - merged_intd[chrom].addi(x[0], x[1], x[2]) - - return merged_intd - - -# create an interval list (chrom, start, end, CN) from a dict of interval trees. 
-def ivald_to_ilist(ivald): - ivals = [] - for chrom, ivalt in ivald.items(): - for ival in ivalt: - ivals.append((chrom, ival.begin, ival.end, ival.data)) - - return ivals - - -# takes list of tuples (chrom, start, end, cn) -def compute_cn_median(cnlist, armlen): - cnsum = sum([x[2] - x[1] for x in cnlist]) - if cnsum < 0.5 * armlen: - return 2.0 - - halfn = cnsum / 2.0 - scns = sorted(cnlist, key=lambda x: x[3]) - rt = 0 - ccn = 0 - for x in scns: - ccn = x[3] - rt += x[2] - x[1] - if rt >= halfn: - break - - return ccn - - -def read_bed(ifname, keepdat=False): - beddict = defaultdict(IntervalTree) - with open(ifname) as infile: - for line in infile: - line = line.rstrip() - if line: - fields = line.rsplit() - s, e = int(fields[1]), int(fields[2]) - if e - s == 0: - logging.warning("Size 0 interval found. Skipping: " + line) - continue - - if keepdat: - beddict[fields[0]].addi(s, e, tuple(fields[3:])) - else: - beddict[fields[0]].addi(s, e) - - return beddict - - -# read regions to split on/filter into dictionary of interval trees, where keys are chromosomes -def read_gain_regions(ref): - AA_DATA_REPO = os.environ["AA_DATA_REPO"] + "/" + ref + "/" - fdict = {} - with open(AA_DATA_REPO + "file_list.txt") as infile: - for line in infile: - line = line.rstrip() - if line: - fields = line.rsplit() - fdict[fields[0]] = fields[1] - - grf = AA_DATA_REPO + fdict["conserved_regions_filename"] - gain_regions = read_bed(grf) - - return gain_regions - - -def get_continuous_high_regions(bedfile, cngain): - raw_input = defaultdict(list) - with open(bedfile) as infile: - for line in infile: - fields = line.rstrip().rsplit("\t") - c, s, e = fields[0], int(fields[1]), int(fields[2]) + 1 - cn = float(fields[-1]) - raw_input[c].append((s, e, cn)) - - return merge_intervals(raw_input, cn_cut=cngain, tol=300000) - - -# take CNV calls (as bed?) 
- have to update to not do CNV_GAIN -# input bed file, centromere_dict -# output: path of prefiltered bed file -def prefilter_bed(bedfile, ref, centromere_dict, chr_sizes, cngain, outdir): - # interval to arm lookup - region_ivald = defaultdict(IntervalTree) - for key, value in chr_sizes.items(): - try: - cent_tup = centromere_dict[key] - region_ivald[key].addi(0, int(cent_tup[0]), key + "p") - region_ivald[key].addi(int(cent_tup[1]), int(value), key + "q") - - # handle mitochondrial contig or other things (like viral genomes) - except KeyError: - region_ivald[key].addi(0, int(value), key) - - # store cnv calls per arm - arm2cns = defaultdict(list) - arm2lens = defaultdict(int) - with open(bedfile) as infile: - for line in infile: - fields = line.rstrip().rsplit("\t") - c, s, e = fields[0], int(fields[1]), int(fields[2]) + 1 - if c == "hs37d5": - continue - - cn = float(fields[-1]) - a = region_ivald[c][(s + e) // 2] - if not a: - a = region_ivald[c][s:e] - - if a: - carm_interval = a.pop() - carm = carm_interval.data - arm2cns[carm].append((c, s, e, cn)) - arm2lens[carm] = carm_interval.end - carm_interval.begin - - else: - arm2cns["other"].append((c, s, e, cn)) - logging.debug("Did not match " + c + ":" + str(s) + "-" + str(e) + " to a known chromosome arm!") - - continuous_high_region_ivald = get_continuous_high_regions(bedfile, cngain) - cn_filt_entries = [] - for a in sorted(arm2cns.keys()): - # compute the median CN of the arm - init_cns = arm2cns[a] - med_cn = compute_cn_median(init_cns, arm2lens[a]) - for x in init_cns: - long_seed_region_penalty_mult = 1.0 - # ignore CN segments over 30 Mbp - if x[2] - x[1] > 30000000: - continue - - # penalize segments over 20 Mbp - elif x[2] - x[1] > 20000000: - long_seed_region_penalty_mult = 2.0 - - continuous_high_hits = continuous_high_region_ivald[x[0]][x[1] : x[2]] - if continuous_high_hits: - for y in continuous_high_hits: - # penalize seeds that overlap a high-CN region of 10 Mbp or more - if y.end - y.begin > 10000000: - long_seed_region_penalty_mult = max(1.5, long_seed_region_penalty_mult) - - ccg = cngain * long_seed_region_penalty_mult - if x[3] > med_cn + ccg - 2: - cn_filt_entries.append(x) - - elif ref == "GRCh38_viral" and not x[0].startswith("chr") and x[3] >= 1: - cn_filt_entries.append(x) - - gain_regions = read_gain_regions(ref) - # now remove regions based on filter regions - filt_ivald = defaultdict(IntervalTree) - for x in cn_filt_entries: - cit = IntervalTree() - cit.addi(x[1], x[2]) - bi = gain_regions[x[0]] - for y in bi: - cit.slice(y.begin) - cit.slice(y.end) - - for p in sorted(cit): - filt_ivald[x[0]].addi(p[0], p[1], x[3]) - - merged_filt_ivald = merge_intervals(filt_ivald, cn_cut=cngain, require_same_cn=True, ref=ref) - final_filt_entries = ivald_to_ilist(merged_filt_ivald) - bname = outdir + "/" + bedfile.rsplit("/")[-1].rsplit(".bed")[0] + "_pre_filtered.bed" - with open(bname, "w") as outfile: - for entry in final_filt_entries: - outfile.write("\t".join([str(x) for x in entry]) + "\n") - - return bname diff --git a/bin/collect_seeds.py b/bin/collect_seeds.py deleted file mode 100755 index 35e74a43..00000000 --- a/bin/collect_seeds.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python - -# Code adapted from PrepareAA (https://github.com/jluebeck/PrepareAA) -# commit/92b81ba55356af85958985b8f80308c8f88921ac - - -import argparse -from datetime import datetime -from subprocess import call - - -# Read the CNVkit .cns files -def collect_seeds(sample, cns): - with open(cns) as infile, open(sample + 
"_CNV_GAIN.bed", "w") as outfile: - head = next(infile).rstrip().rsplit("\t") - for line in infile: - fields = line.rstrip().rsplit("\t") - s, e = int(fields[1]), int(fields[2]) - cn_r = float(fields[4]) - cn = 2 ** (cn_r + 1) - if cn >= args.cngain: # do not filter on size since amplified_intervals.py will merge small ones. - outline = "\t".join(fields[0:3] + ["CNVkit", str(cn)]) + "\n" - outfile.write(outline) - return sample + "_CNV_GAIN.bed" - - -# MAIN # -if __name__ == "__main__": - # Parses the command line arguments - parser = argparse.ArgumentParser(description="Collect AmpliconArchitect Copy Number Seeds") - parser.add_argument("-s", "--sample", help="sample name", required=True) - parser.add_argument("--cns", help="CNVKit .cns file of CNV changes.", default="") - parser.add_argument( - "--cngain", - type=float, - help="CN gain threshold to consider for AA seeding", - default=4.5, - ) - args = parser.parse_args() - collect_seeds(args.sample, args.cns) diff --git a/bin/downsample.py b/bin/downsample.py deleted file mode 100755 index ec373808..00000000 --- a/bin/downsample.py +++ /dev/null @@ -1,202 +0,0 @@ -#!/usr/bin/env python - -# This software is Copyright 2017 The Regents of the University of California. All Rights Reserved. Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. Permission to make commercial use of this software may be obtained by contacting: -# -# Office of Innovation and Commercialization -# -# University of California -# -# La Jolla, CA 92093-0910 -# -# (858) 534-5815 -# -# invent@ucsd.edu -# -# This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. -# -# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
- -# Author: Viraj Deshpande -# Contact: virajbdeshpande@gmail.com -# Source: https://github.com/jluebeck/AmpliconArchitect -# Commit: 2172cdfd5b2834f98f60a5ee77f282249e16f527 - -from time import time - -TSTART = time() -import pysam -import argparse -from time import time -import os -import matplotlib - -matplotlib.use("Agg") -import random - -import global_names - -parser = argparse.ArgumentParser(description="Reconstruct Amplicons connected to listed intervals.") -parser.add_argument( - "--bam", dest="bam", help="Coordinate sorted BAM file with index", metavar="FILE", action="store", type=str, nargs=1 -) -parser.add_argument( - "--final", - dest="final", - help="Optional Final coverage. Default is 10. If initial coverage is less than final, do nothing.", - metavar="FLOAT", - action="store", - type=float, - default=10.0, -) -parser.add_argument( - "--downsample_dir", - dest="downsample_dir", - help="Optional directory to output. Default is same as original bamfile", - metavar="DIR", - action="store", - type=str, - default="", -) -parser.add_argument( - "--cbam", - dest="cbam", - help="Optional bamfile to use for coverage calculation. Also generates new coverage bam file in downsample_dir.", - metavar="FILE", - action="store", - type=str, - default=None, -) -parser.add_argument( - "--cbed", - dest="cbed", - help="Optional bedfile defining 1000 10kbp genomic windows for coverage calcualtion", - metavar="FILE", - action="store", - type=str, - default=None, -) -parser.add_argument( - "--ref", - dest="ref", - help='Values: [hg19, GRCh37, GRCh38, GRCh38_viral, mm10, None]. "hg19", "mm10", "GRCh38" : chr1, .. chrM etc / "GRCh37" : \'1\', \'2\', .. \'MT\' etc/ "None" : Do not use any annotations. AA can tolerate additional chromosomes not stated but accuracy and annotations may be affected.', - metavar="STR", - action="store", - type=str, - required=True, -) -parser.add_argument( - "--cstats_only", - help="Compute the coverage statistics for the BAM file and exit. 
Do not perform any downsampling.", - action="store_true", -) -parser.add_argument( - "--random_seed", - dest="random_seed", - help="Set flag to use the numpy default random seed (sets np.random.seed(seed=None)), otherwise will use seed=0", - action="store_true", - default=False, -) - -args = parser.parse_args() - -global_names.REF = args.ref -global_names.TSTART = TSTART -if args.random_seed: - global_names.SEED = None - -import bam_to_breakpoint as b2b -from breakpoint_graph import * - - -if os.path.splitext(args.bam[0])[-1] == ".cram": - bamFile = pysam.Samfile(args.bam[0], "rc") -else: - bamFile = pysam.Samfile(args.bam[0], "rb") -cbam = None -if args.cbam is not None: - if os.path.splitext(args.cbam[0])[-1] == ".cram": - cbam = pysam.Samfile(args.cbam, "rc") - else: - cbam = pysam.Samfile(args.cbam, "rb") -cbed = args.cbed - - -coverage_stats_file = open(hg.DATA_REPO + "/coverage.stats") -cstats = None -cb = bamFile -if cbam is not None: - cb = cbam - -for l in coverage_stats_file: - ll = l.strip().split() - bamfile_pathname = str(cb.filename.decode()) - if ll[0] == os.path.abspath(bamfile_pathname): - bamfile_filesize = os.path.getsize(bamfile_pathname) - - cstats = tuple(map(float, ll[1:])) - if len(cstats) < 15 or cstats[13] != 3 or bamfile_filesize != int(cstats[14]): # 3 is default sdevs - cstats = None - -coverage_stats_file.close() -coverage_windows = None -if cbed is not None: - coverage_windows = hg.interval_list(cbed, "bed") - coverage_windows.sort() -if cstats is None and cbam is not None: - cbam2b = b2b.bam_to_breakpoint(cbam, coverage_stats=cstats, coverage_windows=coverage_windows) - cstats = cbam2b.basic_stats -elif cstats is None: - bamFileb2b = b2b.bam_to_breakpoint(bamFile, coverage_stats=cstats, coverage_windows=coverage_windows) - cstats = bamFileb2b.basic_stats - -print("Estimated bamfile coverage is ", str(cstats[0])) -if args.cstats_only: - sys.exit(0) - -final = args.final - -if cstats[0] <= final: - exit() -ratio = float(final) / float(cstats[0]) - -print( - "Downsampling:", - args.bam[0], - "Estimated original coverage:", - float(cstats[0]), - "Desired final coverage:", - final, - "DS ratio:", - ratio, -) - -downsample_dir = os.path.dirname(os.path.abspath(args.bam[0])) -if args.downsample_dir != "": - downsample_dir = args.downsample_dir - -i = 0 -rulist = [] -t0 = time() -b2 = pysam.Samfile(downsample_dir + "/" + os.path.basename(args.bam[0])[:-4] + ".DS.bam", "wb", template=bamFile) - -seed_shift = str(t0) -if global_names.SEED is not None: - seed_shift = str(global_names.SEED) - -for a in bamFile.fetch(): - random.seed(a.query_name + seed_shift) - - ru = random.uniform(0, 1) - if ru < ratio: - b2.write(a) -b2.close() -pysam.index(downsample_dir + "/" + os.path.basename(args.bam[0])[:-4] + ".DS.bam") - -# if args.cbam is not None and not os.path.exists(downsample_dir + '/' + os.path.basename(args.cbam)[:-4] + '.DS.bam'): -# c2 = pysam.Samfile(downsample_dir + '/' + os.path.basename(args.cbam)[:-4] + '.DS.bam', 'wb', template = cbam) -# for a in cbam.fetch(): -# random.seed(a.qname) -# if random.uniform(0, 1) < ratio: -# c2.write(a) -# c2.close() -# pysam.index(downsample_dir + '/' + os.path.basename(args.cbam)[:-4] + '.DS.bam') diff --git a/bin/extract_circle_SV_reads.py b/bin/extract_circle_SV_reads.py old mode 100644 new mode 100755 diff --git a/bin/global_names.py b/bin/global_names.py deleted file mode 100755 index 2cb38852..00000000 --- a/bin/global_names.py +++ /dev/null @@ -1,5 +0,0 @@ -# Source: https://github.com/jluebeck/AmpliconArchitect -# 
Commit: 2172cdfd5b2834f98f60a5ee77f282249e16f527 -REF = "hg19" -TSTART = 0 -SEED = 0 diff --git a/bin/hg19util.py b/bin/hg19util.py deleted file mode 100755 index f2e5b50c..00000000 --- a/bin/hg19util.py +++ /dev/null @@ -1,863 +0,0 @@ -# This software is Copyright 2017 The Regents of the University of California. All Rights Reserved. Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. Permission to make commercial use of this software may be obtained by contacting: -# -# Office of Innovation and Commercialization -# -# University of California -# -# La Jolla, CA 92093-0910 -# -# (858) 534-5815 -# -# invent@ucsd.edu -# -# This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. -# -# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. - -# Author: Viraj Deshpande -# Contact: virajbdeshpande@gmail.com - - -##This is a suite to load reference genome (not just hg19, as filename implies), genes, exons, repeat content and perform operations on this genome, compare variants -## it handles annotations from a database and is not restricted to solely hg19 if global_names.REF is not hg19. - -import sys -from bisect import bisect_left -from collections import defaultdict -from time import clock -import pysam -import heapq -import copy -import os -import logging - -if sys.version_info < (3, 0): - from sets import Set - -import global_names - -try: - DATA_REPO = os.environ["AA_DATA_REPO"] -except: - logging.warning( - "#TIME " + "%.3f\t" % clock() + " Unable to set AA_DATA_REPO variable. Setting to working directory" - ) - DATA_REPO = "." -if DATA_REPO == "." or DATA_REPO == "": - logging.warning("#TIME " + "%.3f\t" % clock() + " AA_DATA_REPO not set or empy. Setting to working directory") - DATA_REPO = "." - -REF = global_names.REF -print("Global ref name is " + REF) - -REF_files = defaultdict(lambda: "", {}) -try: - for l in open(DATA_REPO + "/" + REF + "/file_list.txt"): - REF_files[l.strip().split()[0]] = l.strip().split()[1] -except: - logging.warning( - "#TIME " - + "%.3f\t" % clock() - + " Unable to find reference in $AA_DATA_REPO/REF/file_list.txt. Setting to empty." 
- ) - - -class fake_fasta(object): - def fetch(self, a=None, b=0, c=0): - return "".join(["N" for i in range(c - b + 1)]) - - -try: - fa_file = pysam.Fastafile(DATA_REPO + "/" + REF + "/" + REF_files["fa_file"]) -except: - logging.warning( - "#TIME " - + "%.3f\t" % clock() - + ' Unable to open fasta file: "' - + DATA_REPO - + "/" - + REF - + "/" - + REF_files["fa_file"] - + '". Reference sequences will be set to N.' - ) - fa_file = fake_fasta() - -chrLen_filename = DATA_REPO + "/" + REF + "/" + REF_files["chrLen_file"] -duke35_filename = DATA_REPO + "/" + REF + "/" + REF_files["duke35_filename"] -wgexclude_filename = DATA_REPO + "/" + REF + "/" + REF_files["mapability_exclude_filename"] -gene_filename = DATA_REPO + "/" + REF + "/" + REF_files["gene_filename"] -exon_filename = DATA_REPO + "/" + REF + "/" + REF_files["exon_file"] -oncogene_filename = DATA_REPO + "/" + REF + "/" + REF_files["oncogene_filename"] -centromere_filename = DATA_REPO + "/" + REF + "/" + REF_files["centromere_filename"] -conserved_regions_filename = DATA_REPO + "/" + REF + "/" + REF_files["conserved_regions_filename"] -segdup_filename = DATA_REPO + "/" + REF + "/" + REF_files["segdup_filename"] -complementary_nucleotide = defaultdict( - lambda: "N", - { - "A": "T", - "C": "G", - "G": "C", - "T": "A", - "a": "t", - "c": "g", - "g": "c", - "t": "a", - "n": "n", - "N": "N", - }, -) -duke35 = [] -duke35_exists = [True] - -# Handling chromosome names, lengths, sorting, positions and addition of new chromosomes -chr_id = {} -chrName = {} - - -def chrNum(chrname, mode="append"): - if chrname in chr_id: - return chr_id[chrname] - else: - if mode == "init": - cnum = len(chr_id) - else: - cnum = 1000000 + len(chr_id) - chr_id[chrname] = cnum - chrName[cnum] = chrname - return chr_id[chrname] - - -chrLen = defaultdict(lambda: 0, {}) -try: - for line in open(chrLen_filename): - ll = line.strip().split() - chrLen[chrNum(ll[0], mode="init")] = int(ll[1]) -except: - logging.warning( - "#TIME " + "%.3f\t" % clock() + ' Unable to open chromosome lengths file: "' + chrLen_filename + '"' - ) - -chrOffset = {} - - -def absPos(chrname, pos=0): - cnum = chrNum(chrname) - if chrNum(chrname) not in chrOffset: - chrkeys = sorted(chrName.keys()) - sumlen = sum([chrLen[c] for c in chrLen if c in chrOffset]) - for i in range(len(chrkeys)): - if chrkeys[i] not in chrOffset: - chrOffset[chrkeys[i]] = sumlen - sumlen += chrLen[chrkeys[i]] - if cnum < chrkeys[i]: - break - return chrOffset[chrNum(chrname)] + pos - - -for c in chrLen: - ap = absPos(chrName[c]) - - -def chrPos(abspos): - for c in chrOffset: - if chrOffset[c] < abspos and chrOffset[c] + chrLen[c] >= abspos: - return (chrName[c], abspos - chrOffset[c]) - return None - - -def update_chrLen(len_list): - for l in len_list: - chrLen[chrNum(l[0])] = int(l[1]) - for l in len_list: - cpos = absPos(l[0], 1) - - -def reverse_complement(seq): - return "".join([complementary_nucleotide[a] for a in seq][::-1]) - - -class interval(object): - def __init__( - self, - line, - start=-1, - end=-1, - strand=1, - file_format="", - bamfile=None, - info="", - exclude_info_string=False, - ): - self.info = "" - self.file_format = file_format - if type(line) == pysam.AlignedRead or type(line) == pysam.AlignedSegment: - self.load_pysamread(line, bamfile) - elif start == -1: - self.load_line(line, file_format, exclude_info_string=exclude_info_string) - elif end == -1: - self.load_pos(line, start, start, strand) - else: - self.load_pos(line, start, end, strand) - if len(info) > 0: - self.info = info - - def 
load_line(self, line, file_format, exclude_info_string=False): - if file_format == "": - if len(line.strip().split()) == 1: - self.chrom = line.split(":")[0] - self.start = int(line.split(":")[1].split("-")[0]) - if "-" not in line: - self.end = int(line.split(":")[1].split("-")[0]) - else: - self.end = int(line.split(":")[1].split("-")[1]) - if self.start < self.end: - self.strand = 1 - else: - self.strand = -1 - return - else: - file_format = "bed" - if file_format == "gff": - ll = line.strip().split() - self.chrom = ll[0] - self.start, self.end = sorted([int(float(ll[3])), int(float(ll[4]))]) - if ll[6] == "+": - self.strand = 1 - else: - self.strand = -1 - if not exclude_info_string: - self.info = {r[0 : r.find("=")]: r[r.find("=") + 1 :] for r in ll[8].strip().strip(";").split(";")} - self.info["Variant"] = ll[5] - elif file_format == "bed": - ll = line.strip().split() - self.chrom = ll[0] - if (REF == "hg19" or REF == "GRCh38") and 0 < len(self.chrom) < 3: - try: - ci = int(self.chrom) - if 0 < ci < 23: - self.chrom = "chr" + self.chrom - logging.info("Corrected chromosome name (appended 'chr') " + self.chrom + " \n") - - except ValueError: - if self.chrom in {"M", "X", "Y"}: - self.chrom = "chr" + self.chrom - else: - logging.warning("Chromosome name " + self.chrom + " may be incompatible") - - self.start, self.end = sorted([int(float(ll[1])), int(float(ll[2]))]) - if int(float(ll[2])) >= int(float(ll[1])): - self.strand = 1 - else: - self.strand = -1 - if not exclude_info_string: - self.info = ll[3:] - else: - raise (Exception("Invalid interval format" + str(line))) - - def load_pos(self, chrom, start, end, strand): - self.chrom = chrom - self.start = int(start) - self.end = int(end) - self.strand = strand - if start > end: - self.start = int(end) - self.end = int(start) - self.strand = -1 * strand - - def load_pysamread(self, line, bamfile): - if bamfile is None: - raise Exception("Interval of pysam AlignedRead without bamfile") - self.chrom = line.reference_name - self.start = line.reference_start - self.end = 0 - if line.reference_end is not None: - self.end = line.reference_end - else: - logging.warning("Reference_end for " + str(self) + " was NoneType. 
Setting to 0.") - - if line.is_reverse: - self.strand = -1 - else: - self.strand = 1 - - def __gt__(self, y): - if self.chrom != y.chrom: - return chrNum(self.chrom) > chrNum(y.chrom) - elif int(self.end) != int(y.end): - return int(self.end) > int(y.end) - else: - return int(self.start) > int(y.start) - - def size(self): - return self.end - self.start + 1 - - def __str__(self): - if len(str(self.info)) == 0: - return "\t".join(map(str, [self.chrom, self.start, self.end])) - elif type(self.info) == list: - return "\t".join(map(str, [self.chrom, self.start, self.end] + self.info)) - elif type(self.info) == dict: - return "\t".join( - map( - str, - [self.chrom, self.start, self.end] + [str(s) + "=" + str(self.info[s]) for s in self.info], - ) - ) - else: - return "\t".join(map(str, [self.chrom, self.start, self.end, self.info])) - - def gc_content(self): - seq = fa_file.fetch(self.chrom, self.start, self.end) - # if 'G' in seq: - # print seq, seq.count('G'), seq.count('C'), float(seq.count('G') + seq.count('C')) / len(seq) - # exit() - if len(seq) == 0: - return 0.5 - return float(seq.count("G") + seq.count("C") + seq.count("g") + seq.count("c")) / len(seq) - - def sequence(self, new_fa_file=None): - if new_fa_file is not None: - seq = new_fa_file.fetch(self.chrom, self.start, self.end + 1) - else: - seq = fa_file.fetch(self.chrom, self.start, self.end + 1) - if self.strand == 1: - return seq - else: - return "".join([complementary_nucleotide[a] for a in seq][::-1]) - - def intersects(self, n, extend=0, margin=0.0): - if margin > 0.0: - if self.intersects( - interval(n.chrom, n.start, n.end - (1 - margin) * (n.end - n.start)) - ) and self.intersects(interval(n.chrom, n.start + (1 - margin) * (n.end - n.start)), n.end): - return True - else: - s = self - if n.intersects(interval(s.chrom, s.start, s.end - (1 - margin) * (s.end - s.start))) and n.intersects( - interval(s.chrom, s.start + (1 - margin) * (s.end - s.start)), s.end - ): - return True - return False - a = [self.chrom, max(0, self.start - extend), self.end + extend] - b = [n.chrom, n.start, n.end] - if a[0] != b[0]: - return False - if (int(a[1]) - int(b[1])) * (int(a[2]) - int(b[1])) <= 0: - return True - if (int(a[1]) - int(b[2])) * (int(a[2]) - int(b[2])) <= 0: - return True - if (int(a[1]) - int(b[1])) * (int(a[1]) - int(b[2])) <= 0: - return True - if (int(a[2]) - int(b[1])) * (int(a[2]) - int(b[2])) <= 0: - return True - return False - - def intersection(self, y): - if not self.intersects(y): - return None - return interval(self.chrom, max(self.start, y.start), min(self.end, y.end)) - - def merge(self, y, extend=0): - if not self.intersects(y, extend): - return None - return interval(self.chrom, min(self.start, y.start), max(self.end, y.end)) - - def atomize(self, y): - il = interval_list([self, y]) - il.sort() - ilr = [] - if il[0].intersects(il[1]): - ilint = il[0].intersection(il[1]) - if il[0].start < il[1].start: - ilr.append((interval(il[0].chrom, il[0].start, ilint.start - 1), [il[0]])) - elif il[1].start < il[0].start: - ilr.append((interval(il[1].chrom, il[1].start, ilint.start - 1), [il[1]])) - ilr.append((ilint, il)) - if il[0].end > il[1].end: - ilr.append((interval(il[0].chrom, ilint.end + 1, il[0].end), [il[0]])) - elif il[1].end > il[0].end: - ilr.append((interval(il[1].chrom, ilint.end + 1, il[1].end), [il[1]])) - return ilr - else: - return [(il[0], [il[0]]), (il[1], [il[1]])] - - def contains(self, x, y=-1, z=-1): - if type(x) == interval: - if self.intersects(x) and self.intersection(x).size() == x.size(): - 
return True - else: - return False - if y != -1: - if z == -1: - z = y - if ( - self.intersects(interval(x, y, z)) - and self.intersection(interval(x, y, z)).size() == interval(x, y, z).size() - ): - return True - return False - - def filter_repeat(self): - if len(interval_list([self]).intersection(wgexclude)) > 0: - return True - if len(interval_list([self]).intersection(conserved_regions)) > 0: - return True - if self.rep_content() > 4.5: - return True - return False - - def rep_content(self): - # logging.info("#TIME " + '%.3f\t'%clock() + " rep_content: init ") - if self.chrom == "chrM" or self.chrom == "MT": - return 5.0 - if self.chrom.strip("chr") not in map(str, range(1, 23)) + ["X" + "Y"]: - return 1.0 - s34 = interval(self.chrom, self.start, max(self.start, self.end - 34)) - # logging.info("#TIME " + '%.3f\t'%clock() + " rep_content: to load duke ") - if duke35_exists[0] and len(duke35) == 0: - try: - duke35file = open(duke35_filename) - duke35.extend([l.strip() for l in duke35file]) - duke35file.close() - except: - logging.warning( - "#TIME " - + "%.3f\t" % clock() - + ' rep_content: Unable to open mapability file "' - + duke35_filename - + '".' - ) - duke35_exists[0] = False - duke35.extend(["chr_Un 0 1 1"]) - # logging.info("#TIME " + '%.3f\t'%clock() + " rep_content: duke loaded") - ictime = 0 - itime = 0 - hi = len(duke35) - 1 - lo = 0 - numiter = 0 - while hi - lo > 1: - numiter += 1 - p = (hi + lo) / 2 - ctime = clock() - m = interval(duke35[p]) - ictime += clock() - ctime - ctime = clock() - if s34.intersects(m) or m > s34: - hi = p - else: - lo = p - itime += clock() - ctime - p = lo - m = interval(duke35[p]) - sum_duke = 0 - len_duke = 0 - # logging.info("#TIME " + '%.3f\t'%clock() + " rep_content: found " + str(numiter) + " " + str(ictime) + " " + str(itime)) - while s34 > m or s34.intersects(m): - if not s34.intersects(m): - p += 1 - if p >= len(duke35) or p <= 0: - raise Exception( - "p index out of range: " - + str(p) - + " " - + str(lo) - + " " - + str(self) - + " " - + str(m) - + " " - + str(interval(duke35[lo])) - ) - m = interval(duke35[p]) - continue - repc = 5.0 if float(m.info[0]) == 0 else 1.0 / float(m.info[0]) - sum_duke += s34.intersection(m).size() * repc - len_duke += s34.intersection(m).size() - p += 1 - if p >= len(duke35): - break - m = interval(duke35[p]) - # logging.info("#TIME " + '%.3f\t'%clock() + " rep_content: done") - # exit() - if len_duke > 0: - return sum_duke / len_duke - else: - return 1.0 - - def num_unmasked(self): - if self.chrom not in fa_file.references: - return self.size() - seq = fa_file.fetch(self.chrom, self.start, self.end) - return len([c for c in seq if c in "ACGT"]) - - def segdup_uniqueness(self): - sl = interval_list([self]).intersection(segdup_list) - slsd = sum([self.intersection(i[1]).size() for i in sl]) - return float(self.size()) / (self.size() + slsd) - - def extend(self, extend_len=0): - return interval( - self.chrom, - max(0, self.start - extend_len), - min(self.end + extend_len, chrLen[chrNum(self.chrom)]), - self.strand, - ) - - -class interval_list(list, object): - def __init__(self, ilist=None, file_format=None, sort=True, exclude_info_string=False): - if ilist == None: - ilist = [] - self.file_format = file_format - if file_format in ["bed", "gff"]: - self.bed_to_list(ilist, exclude_info_string=exclude_info_string) - if file_format is None: - list.__init__(self, ilist) - if sort: - self.sort() - self.offset = None - - def bed_to_list(self, file_name, exclude_info_string=False): - if file_name is not None: - 
try: - f = open(file_name) - list.__init__( - self, - [ - interval( - l, - file_format=self.file_format, - exclude_info_string=exclude_info_string, - ) - for l in f - if len(l.strip().split()) > 2 and l.strip()[0] != "#" - ], - ) - f.close() - except: - logging.warning( - "#TIME " + "%.3f\t" % clock() + ' interval_list: Unable to open interval file "' + file_name + '".' - ) - - def merge_clusters(self, extend=0, margin=0.0): - ml = [] - ci = None - cl = [] - ai = 0 - cend = len(self) - for a in self[::-1]: - ai += 1 - if ci is None or not a.intersects(ci, extend, margin): - cstart = len(self) - ai + 1 - cl = self[cstart:cend] - if ci is not None: - ml.append((ci, cl)) - ci = a - cl = [] - cend = len(self) - ai + 1 - # if ai != sum([len(m[1]) for m in ml]) + 1: - # print "divergent", ai, str(a) - # exit() - ci = ci.merge(a, extend) - # cl.append(a) - cstart = 0 - cl = self[cstart:cend] - if ci is not None: - ml.append((ci, cl)) - return ml[::-1] - - def repeats(self, count=1): - activeq = [] - if activeq is None: - print("h1") - exit() - jinterval = None - ilist = [] - for a in self[::-1]: - while len(activeq) > 0 and not a.intersects(activeq[0][1]): - heapq.heappop(activeq) - if activeq is None: - print("h2") - exit() - if len(activeq) < count and jinterval is not None: - ilist.append((jinterval, copy.copy(aq))) - if activeq is None: - print("h3") - exit() - jinterval = None - heapq.heappush(activeq, (-1 * a.start, a)) - if len(activeq) >= count: - if jinterval is None: - jinterval = interval(a.chrom, activeq[0][1].start, a.end) - aq = copy.copy(activeq) - else: - jinterval.start = min(jinterval.start, activeq[0][1].start) - heapq.heappush(aq, (-1 * a.start, a)) - if jinterval is not None: - ilist.append((jinterval, copy.copy(aq))) - jinterval = None - return ilist[::-1] - - def intersection(self, l2, extend=0): - si = len(self) - 1 - l2i = len(l2) - 1 - sj = len(self) - 1 - l2j = len(l2) - 1 - il = [] - while si >= 0: - while l2i >= 0 and l2[l2i] > self[si] and not self[si].intersects(l2[l2i], extend=extend): - l2i -= 1 - l2j = l2i - while l2j >= 0 and self[si].intersects(l2[l2j], extend=extend): - il.append((self[si], l2[l2j])) - l2j -= 1 - si -= 1 - return il[::-1] - - def atomize(self, h2): - i = 0 - j = 0 - atomlist = [] - if len(self) > 0: - c1 = self[0] - if len(h2) > 0: - c2 = h2[0] - c = None - while i < len(self) or j < len(h2): - # if c is not None: - # print "%%", i, j, str(c[0]), [str(aa) for aa in c[1]], [str(aa[0]) for aa in atomlist] - # else: - # print "%%", i, j, [], [str(aa[0]) for aa in atomlist] - if c is not None: - if i < len(self) and self[i] not in c[1] and (self[i].intersects(c[0], -1) or c[0] > self[i]): - atm = self[i].atomize(c[0]) - atm = [ - ( - aa[0], - [(lambda x: c[1][0] if x == c[0] else x)(aai) for aai in aa[1]], - ) - for aa in atm - ] - # print "%i", [len(rr[1]) for rr in atm], [str(rr[0]) for rr in atm] - c = atm[-1] - i += 1 - atomlist += atm[:-1] - elif j < len(h2) and h2[j] not in c[1] and (h2[j].intersects(c[0], -1) or c[0] > h2[j]): - # print j, str(h2[j]), str(c[0]), c[0] > h2[j] - atm = c[0].atomize(h2[j]) - atm = [ - ( - aa[0], - [(lambda x: c[1][0] if x == c[0] else x)(aai) for aai in aa[1]], - ) - for aa in atm - ] - # print "%j", [len(rr[1]) for rr in atm], [str(rr[0]) for rr in atm] - c = atm[-1] - j += 1 - atomlist += atm[:-1] - else: - atomlist.append(c) - # if i < len(self) and self[i] in c[1]: - # i += 1 - # if j < len(h2) and h2[j] in c[1]: - # j += 1 - c = None - else: - if i >= len(self): - atomlist.append((h2[j], [h2[j]])) - j += 1 
- elif j >= len(h2): - atomlist.append((self[i], [self[i]])) - i += 1 - else: - atm = self[i].atomize(h2[j]) - atomlist += atm[:-1] - c = atm[-1] - # if self[i] not in c[1]: - i += 1 - # if h2[j] not in c[1]: - j += 1 - if c is not None: - atomlist.append(c) - return atomlist - - def get_repeat_content(self): - try: - duke35_file = open(duke35_filename) - print("counting repeats", clock()) - self.sort() - sum_duke = [0.0 for i in self] - len_duke = [0.0 for i in self] - lno = 0 - i = 0 - j = 0 - for line in duke35_file: - lno += 1 - duke_int = interval(line) - while not (duke_int.intersects(self[i])) and duke_int > self[i]: - i += 1 - if not duke_int.intersects(self[i]) and self[i] > duke_int: - continue - j = i - repc = 5.0 if float(duke_int.info[0]) == 0 else 1 / float(duke_int.info[0]) - while j < len(self) and self[j].intersects(duke_int): - sum_duke[j] += self[j].intersection(duke_int).size() * repc - len_duke[j] += self[j].intersection(duke_int).size() - j += 1 - duke35_file.close() - return {self[i]: sum_duke[i] / len_duke[i] for i in range(len(interval_list))} - except: - logging.warning( - "#TIME " - + "%.3f\t" % clock() - + ' get_repeat_content: Unable to open mapability file "' - + duke35_filename - + '".' - ) - duke35_exists[0] = False - duke35.extend(["chr_Un 0 1 1"]) - return {self[i]: 1.0 for i in range(len(interval_list))} - - def offsets(self): - if self.offset is not None: - return self.offset - gap = 0.1 - hratio = 0.8 - - vlist = [i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"] - hlist = [i for i in self if chrNum(i.chrom) < 100 or i.chrom[:3] == "chr"] - v_count = len([i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"]) - h_count = len(self) - v_count - h_sum = sum([i.size() for i in hlist]) - v_sum = sum([i.size() for i in vlist]) - - hK = len([i for i in hlist if i.size() < h_sum * gap / max(1, h_count)]) - hS = sum([i.size() for i in hlist if i.size > h_sum * gap / max(1, h_count)]) - min_hsize = hS / (max(1, h_count) / gap - hK) - h_sum = hS + hK * min_hsize - - vK = len([i for i in vlist if i.size() < v_sum * gap / max(1, v_count)]) - vS = sum([i.size() for i in vlist if i.size > v_sum * gap / max(1, v_count)]) - min_vsize = vS / (max(1, v_count) / gap - vK) - v_sum = vS + vK * min_vsize - - offset = {} - - h_start = 0 - hscale = 1 if v_count == 0 else hratio - v_start = 0 if h_count == 0 else hratio - vscale = 1 if h_count == 0 else (1 - hratio) - - hgap = gap / h_count if h_count > 0 else 0 - vgap = gap / v_count if v_count > 0 else 0 - hpos = h_start + (hgap / 2) * hscale - vpos = v_start + (vgap / 2) * vscale - for i in hlist: - isize = max(i.size(), min_hsize) - offset[i] = (hpos, hpos + ((1 - gap) * isize / h_sum) * hscale) - hpos = hpos + ((1 - gap) * isize / h_sum + hgap) * hscale - for i in vlist: - isize = max(i.size(), min_vsize) - offset[i] = (vpos, vpos + ((1 - gap) * isize / v_sum) * vscale) - vpos = vpos + ((1 - gap) * isize / v_sum + vgap) * vscale - self.offset = offset - # for i in self: - # print str(i), offset[i], i.size(), hgap, h_sum, hscale, gap, hpos, vpos - # exit() - return offset - - def xpos(self, chrom, pos): - offset = self.offsets() - for i in self: - if i.intersects(interval(chrom, max(0, pos - 1), pos)): - o = offset[i] - return (o[1] * (pos - i.start) + o[0] * (i.end - pos)) / (i.end - i.start) - return None - - def offset_breaks(self): - offset = self.offsets() - gap = 0.1 - hratio = 0.8 - - vlist = [i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"] - hlist = [i for i in self 
if chrNum(i.chrom) < 100 or i.chrom[:3] == "chr"] - v_count = len([i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"]) - h_count = len(self) - v_count - h_sum = sum([i.size() for i in hlist]) - v_sum = sum([i.size() for i in vlist]) - - hscale = 1 if v_count == 0 else hratio - vscale = 1 if h_count == 0 else (1 - hratio) - - hgap = gap / h_count if h_count > 0 else 0 - vgap = gap / v_count if v_count > 0 else 0 - - breaks = [] - iprev = None - for i in self: - if iprev is None: - iprev = i - continue - if i in hlist and iprev.chrom == i.chrom: - breaks.append((offset[i][0] - hscale * hgap / 2, ":", i.chrom)) - print(str(i), str(iprev), i in hlist, iprev.chrom == i.chrom) - elif i in hlist and iprev.chrom != i.chrom: - breaks.append((offset[i][0] - hscale * hgap / 2, "--", i.chrom)) - elif i in vlist and iprev in hlist: - breaks.append((offset[i][0] - vscale * vgap / 2, "-", i.chrom)) - elif i in vlist and i.chrom == iprev.chrom: - breaks.append((offset[i][0] - vscale * vgap / 2, ":", i.chrom)) - else: - breaks.append((offset[i][0] - vscale * vgap / 2, "--", i.chrom)) - - iprev = i - return breaks - - def __str__(self): - return str(([str(i) for i in self])) - - -oncogene_list = interval_list(oncogene_filename, "gff") -oncogene_list.sort() -gene_list = interval_list(gene_filename, "gff") - - -exon_list = interval_list([]) - - -def load_exons(): - if len(exon_list) > 0: - return - try: - exon_file = open(exon_filename) - exonFields = [ - interval(j, file_format="gff") - for j in exon_file.read().strip().split("\n") - if ( - len(j.strip()) > 0 - and j.strip()[0] != "#" - and {r.split("=")[0]: r.split("=")[1] for r in j.strip().split()[8].strip(";").split(";")}["color"] - == "000080" - ) - ] - exon_file.close() - exon_list.extend((exonFields)) - except: - logging.warning("#TIME " + "%.3f\t" % clock() + 'unable to load exon file: "' + exon_filename + '"') - - -conserved_regions = interval_list(conserved_regions_filename, "bed") -conserved_regions.sort() - -wgexclude = interval_list(wgexclude_filename, "bed") -wgexclude.sort() - -centromere_list = interval_list(centromere_filename, "bed") -centromere_list.sort() -centromere_list = interval_list([i[0] for i in centromere_list.merge_clusters(extend=1)]) - - -segdup_list = interval_list(segdup_filename, "bed") -segdup_list.sort() diff --git a/bin/mosek_solver.py b/bin/mosek_solver.py deleted file mode 100644 index 36bea415..00000000 --- a/bin/mosek_solver.py +++ /dev/null @@ -1,259 +0,0 @@ -#!/usr/bin/env python - -# Author: Jens Luebeck -# Contact: jluebeck [at] ucsd.edu -# License: BSD 2-Clause License -# Source: https://github.com/AmpliconSuite/AmpliconSuite-pipeline -# Commit: 0a8a2ff2324b15aab7cb88d310dcc458d06c0bed - -# Interface to MOSEK for AmpliconArchitect for Python3 -# -# Supports all versions of MOSEK >= 8 -# -import logging -import sys - -import mosek - -# Check Mosek version -mosek_ver = mosek.Env.getversion() -logging.info("Mosek version is {}".format(".".join([str(x) for x in mosek_ver]))) -mosek_major = mosek_ver[0] - -if sys.version_info < (3, 0) and mosek_major >= 10: - logging.warning("Mosek version is " + ".".join([str(x) for x in mosek_ver]) + " which requires python3. 
Exiting.\n") - sys.exit(1) - - -# MOSEK logging -mosek_logger = logging.getLogger("MOSEK") - - -def moseklogfunc(msg): - mosek_logger.debug(msg.rstrip()) - - -class fusionlogger: - def write(self, msg): - moseklogfunc(msg) - - def flush(self): - pass - - -# Calls MOSEK to solve one instance of the problem -def call_mosek(n, m, asub, aval, coeff_c, coeff_f, coeff_g, const_h): - mosek_logger.info("Beginning MOSEK call") - - ## Enable this line to ALWAYS save all Mosek inputs - # save_mosek_input(n, m, asub, aval, coeff_c, coeff_f, coeff_g, const_h) - - try: - # Determine which MOSEK routing to call - if mosek_major == 8: - return call_mosek_scopt(n, m, asub, aval, coeff_c, coeff_f, coeff_g, const_h) - elif mosek_major == 9: - return call_mosek_fusion(n, m, asub, aval, coeff_c, coeff_f) - elif mosek_major >= 10: - return call_mosek_acc(n, m, asub, aval, coeff_c, coeff_f) - else: - raise Exception("Unsupported MOSEK version {}".format(mosek_major)) - except Exception as e: - # If an error occurred in the MOSEK call then save - # all input data to a JSON file so they can be loaded - # to recreate the MOSEK problem in a stand-alone way. - mosek_logger.error("Error when using MOSEK: {}".format(e)) - print("Error when using MOSEK: {}".format(e)) - filename = save_mosek_input(n, m, asub, aval, coeff_c, coeff_f, coeff_g, const_h) - mosek_logger.info( - "Saved MOSEK inputs to {}. Submit that file to support to reproduce the issue.".format(filename) - ) - raise e - - -""" -This method works with MOSEK == 8. - -Solves the problem - -minimize c^T * x - sum_i(f_i * log(g_i * x_i + h_i)) -subject to A * x == 0 - x >= 0 -""" - - -def call_mosek_scopt(n, m, asub, aval, coeff_c, coeff_f, coeff_g, const_h): - with mosek.Env() as env: - with env.Task() as task: - task.set_Stream(mosek.streamtype.log, moseklogfunc) - - numvar = n + m - numcon = 2 * n - - task.appendcons(numcon) - task.appendvars(numvar) - - task.putvarboundslice(0, numvar, [mosek.boundkey.lo] * numvar, [0.0] * numvar, [0.0] * numvar) - task.putconboundslice(0, numcon, [mosek.boundkey.fx] * numcon, [0.0] * numcon, [0.0] * numcon) - - for i in range(numcon): - task.putarow(i, asub[i], aval[i]) - - task.putclist(range(numvar), coeff_c) - - task.putobjsense(mosek.objsense.minimize) - - task.putSCeval([mosek.scopr.log] * (n + m), range(n + m), coeff_f, coeff_g, const_h) - - task.optimize() - task.solutionsummary(mosek.streamtype.log) - - if task.getsolsta(mosek.soltype.itr) != mosek.solsta.optimal: - raise Exception( - "Failed to solve to optimality. Solution status {}".format(task.getsolsta(mosek.soltype.itr)) - ) - - res = [0.0] * numvar - task.getsolutionslice(mosek.soltype.itr, mosek.solitem.xx, 0, numvar, res) - - return res - - -""" -This method works with MOSEK >= 10. - -Solves the problem - -minimize c^T * x - sum_i(f_i * log(x_i)) -subject to A * x == 0 - -Comments, compared to MOSEK 8 model: - -We ignore the normalizing coefficient h_i from log(g_i * x_i + h_i) and consider only log(g_i * x_i). -Subject to that change we can also skip g_i since it only changes the constant term in the objective. -The condition x>=0 is implicit by x appearing in the logarithm. 
-""" - - -def call_mosek_acc(n, m, asub, aval, coeff_c, coeff_f): - with mosek.Task() as task: - task.set_Stream(mosek.streamtype.log, moseklogfunc) - - task.appendvars(2 * (n + m)) - task.appendcons(2 * n) - task.putvarboundsliceconst(0, 2 * (n + m), mosek.boundkey.fr, 0, 0) - - for i in range(2 * n): - task.putarow(i, asub[i], aval[i]) - - task.putconboundsliceconst(0, 2 * n, mosek.boundkey.fx, 0, 0) - - task.appendafes(2 * (n + m) + 1) - task.putafefentrylist(range(0, 2 * (n + m)), range(0, 2 * (n + m)), [1.0] * (2 * (n + m))) - task.putafeg(2 * (n + m), 1.0) - - expdom = task.appendprimalexpconedomain() - task.appendaccs([expdom] * (n + m), sum([[i, 2 * (n + m), i + n + m] for i in range(n + m)], []), None) - - task.putclist(range(0, n + m), coeff_c) - task.putclist(range(n + m, 2 * (n + m)), coeff_f) - - task.putobjsense(mosek.objsense.minimize) - - task.optimize() - task.solutionsummary(mosek.streamtype.log) - - if task.getsolsta(mosek.soltype.itr) != mosek.solsta.optimal: - raise Exception( - "Failed to solve to optimality. Solution status {}".format(task.getsolsta(mosek.soltype.itr)) - ) - - return task.getxxslice(mosek.soltype.itr, 0, n + m) - - -""" -This method works with MOSEK >= 9. - -Solves the problem - -minimize c^T * x - sum_i(f_i * log(x_i)) -subject to A * x == 0 - -Comments, compared to MOSEK 10 model: - -A simple model in the higher level MOSEK Fusion. Anyhow, we do not expect MOSEK 9 users, really. -Either stay with MOSEK 8 or otherwise there is no reason not to upgrade all the way to MOSEK 10. - -This model can be used in MOSEK >= 9, but it invokes the additional Fusion modeling layer, -which the model from call_mosek_acc skips. If it behaves well though, we could make -it the default. It should be fast enough, and is more readable. -""" - - -def call_mosek_fusion(n, m, asub, aval, coeff_c, coeff_f): - from mosek.fusion import Model, Domain, Expr, Matrix, ObjectiveSense, SolutionStatus - - with Model() as M: - M.setLogHandler(fusionlogger()) - - x = M.variable(n + m) - t = M.variable(n + m) - - for i in range(2 * n): - M.constraint(Expr.dot(aval[i], x.pick(asub[i])), Domain.equalsTo(0)) - - M.constraint(Expr.hstack(x, Expr.constTerm(n + m, 1.0), t), Domain.inPExpCone()) - - M.objective(ObjectiveSense.Minimize, Expr.add(Expr.dot(coeff_c, x), Expr.dot(coeff_f, t))) - - M.solve() - - if M.getPrimalSolutionStatus() != SolutionStatus.Optimal: - raise Exception("Failed to solve to optimality. Solution status {}".format(M.getPrimalSolutionStatus())) - - return x.level() - - -# Debug functions. Dumping input data. -mosek_save_num = 1 - - -def save_mosek_input(n, m, asub, aval, coeff_c, coeff_f, coeff_g, const_h): - import json - - global mosek_save_num - filename = "mosekinput-{}.json".format(mosek_save_num) - data = { - "n": n, - "m": m, - "asub": asub, - "aval": aval, - "coeff_c": coeff_c, - "coeff_f": coeff_f, - "coeff_g": coeff_g, - "const_h": const_h, - } - - with open(filename, "w") as f: - json.dump(data, f) - - mosek_save_num += 1 - return filename - - -# Debug functions. Loading input data. -def load_mosek_input(filename): - import json - - with open(filename, "r") as f: - data = json.load(f) - return ( - data["n"], - data["m"], - data["asub"], - data["aval"], - data["coeff_c"], - data["coeff_f"], - data["coeff_g"], - data["const_h"], - ) diff --git a/bin/mycolors.py b/bin/mycolors.py deleted file mode 100755 index 56994e2e..00000000 --- a/bin/mycolors.py +++ /dev/null @@ -1,136 +0,0 @@ -# This software is Copyright 2017 The Regents of the University of California. 
All Rights Reserved. Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. Permission to make commercial use of this software may be obtained by contacting: -# -# Office of Innovation and Commercialization -# -# University of California -# -# La Jolla, CA 92093-0910 -# -# (858) 534-5815 -# -# invent@ucsd.edu -# -# This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. -# -# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. - -# Author: Viraj Deshpande -# Contact: virajbdeshpande@gmail.com -# Maintained by Jens Luebeck, jluebeck@ucsd.edu -# Source: https://github.com/AmpliconSuite/AmpliconSuite-pipeline -# Commit: 0a8a2ff2324b15aab7cb88d310dcc458d06c0bed - - -chrcolor = { - "b": "b", - "g": "g", - "r": "r", - "c": "c", - "m": "m", - "y": "y", - "k": "k", - "w": "w", - "chr1": (153 / 256.0, 102 / 256.0, 0 / 256.0), - "chr2": (102 / 256.0, 102 / 256.0, 0 / 256.0), - "chr3": (153 / 256.0, 153 / 256.0, 30 / 256.0), - "chr4": (204 / 256.0, 0 / 256.0, 0 / 256.0), - "chr5": (255 / 256.0, 0 / 256.0, 0 / 256.0), - "chr6": (255 / 256.0, 0 / 256.0, 204 / 256.0), - "chr7": (255 / 256.0, 204 / 256.0, 204 / 256.0), - "chr8": (255 / 256.0, 153 / 256.0, 0 / 256.0), - "chr9": (255 / 256.0, 204 / 256.0, 0 / 256.0), - "chr10": (255 / 256.0, 255 / 256.0, 0 / 256.0), - "chr11": (204 / 256.0, 255 / 256.0, 0 / 256.0), - "chr12": (0 / 256.0, 255 / 256.0, 0 / 256.0), - "chr13": (53 / 256.0, 128 / 256.0, 0 / 256.0), - "chr14": (0 / 256.0, 0 / 256.0, 204 / 256.0), - "chr15": (102 / 256.0, 153 / 256.0, 255 / 256.0), - "chr16": (153 / 256.0, 204 / 256.0, 255 / 256.0), - "chr17": (0 / 256.0, 255 / 256.0, 255 / 256.0), - "chr18": (204 / 256.0, 255 / 256.0, 255 / 256.0), - "chr19": (153 / 256.0, 0 / 256.0, 204 / 256.0), - "chr20": (204 / 256.0, 51 / 256.0, 255 / 256.0), - "chr21": (204 / 256.0, 153 / 256.0, 255 / 256.0), - "chr22": (102 / 256.0, 102 / 256.0, 102 / 256.0), - "chr23": (153 / 256.0, 153 / 256.0, 153 / 256.0), - "chrX": (153 / 256.0, 153 / 256.0, 153 / 256.0), - "chr24": (204 / 256.0, 204 / 256.0, 204 / 256.0), - "chrY": (204 / 256.0, 204 / 256.0, 204 / 256.0), - "chrM": (204 / 256.0, 204 / 256.0, 153 / 256.0), - "chr0": (204 / 256.0, 204 / 256.0, 153 / 256.0), - "chrUn": (121 / 256.0, 204 / 256.0, 61 / 256.0), - "chrNA": (255 / 256.0, 255 / 
256.0, 255 / 256.0), - "lum90chr1": (255 / 256.0, 216 / 256.0, 156 / 256.0), - "lum90chr2": (230 / 256.0, 230 / 256.0, 165 / 256.0), - "lum90chr3": (232 / 256.0, 232 / 256.0, 135 / 256.0), - "lum90chr4": (255 / 256.0, 166 / 256.0, 166 / 256.0), - "lum90chr5": (255 / 256.0, 147 / 256.0, 147 / 256.0), - "lum90chr6": (255 / 256.0, 152 / 256.0, 255 / 256.0), - "lum90chr7": (255 / 256.0, 214 / 256.0, 214 / 256.0), - "lum90chr8": (255 / 256.0, 202 / 256.0, 102 / 256.0), - "lum90chr9": (255 / 256.0, 220 / 256.0, 58 / 256.0), - "lum90chr10": (234 / 256.0, 234 / 256.0, 0 / 256.0), - "lum90chr11": (194 / 256.0, 245 / 256.0, 0 / 256.0), - "lum90chr12": (34 / 256.0, 255 / 256.0, 34 / 256.0), - "lum90chr13": (174 / 256.0, 244 / 256.0, 155 / 256.0), - "lum90chr14": (215 / 256.0, 215 / 256.0, 255 / 256.0), - "lum90chr15": (182 / 256.0, 224 / 256.0, 255 / 256.0), - "lum90chr16": (182 / 256.0, 231 / 256.0, 255 / 256.0), - "lum90chr17": (0 / 256.0, 252 / 256.0, 252 / 256.0), - "lum90chr18": (185 / 256.0, 236 / 256.0, 236 / 256.0), - "lum90chr19": (255 / 256.0, 191 / 256.0, 255 / 256.0), - "lum90chr20": (255 / 256.0, 177 / 256.0, 255 / 256.0), - "lum90chr21": (255 / 256.0, 206 / 256.0, 255 / 256.0), - "lum90chr22": (198 / 256.0, 198 / 256.0, 198 / 256.0), - "lum90chr23": (153 / 256.0, 153 / 256.0, 153 / 256.0), - "lum90chrX": (153 / 256.0, 153 / 256.0, 153 / 256.0), - "lum90chr24": (204 / 256.0, 204 / 256.0, 204 / 256.0), - "lum90chrY": (204 / 256.0, 204 / 256.0, 204 / 256.0), - "lum90chrM": (174 / 256.0, 174 / 256.0, 122 / 256.0), - "lum90chr0": (174 / 256.0, 174 / 256.0, 122 / 256.0), - "lum90chrUn": (108 / 256.0, 191 / 256.0, 38 / 256.0), - "lum90chrNA": (171 / 256.0, 171 / 256.0, 171 / 256.0), - "lum80chr1": (244 / 256.0, 188 / 256.0, 127 / 256.0), - "lum80chr2": (202 / 256.0, 202 / 256.0, 136 / 256.0), - "lum80chr3": (203 / 256.0, 203 / 256.0, 103 / 256.0), - "lum80chr4": (255 / 256.0, 137 / 256.0, 137 / 256.0), - "lum80chr5": (255 / 256.0, 116 / 256.0, 116 / 256.0), - "lum80chr6": (255 / 256.0, 119 / 256.0, 255 / 256.0), - "lum80chr7": (237 / 256.0, 186 / 256.0, 186 / 256.0), - "lum80chr8": (255 / 256.0, 174 / 256.0, 62 / 256.0), - "lum80chr9": (243 / 256.0, 192 / 256.0, 0 / 256.0), - "lum80chr10": (206 / 256.0, 206 / 256.0, 0 / 256.0), - "lum80chr11": (166 / 256.0, 216 / 256.0, 0 / 256.0), - "lum80chr12": (0 / 256.0, 232 / 256.0, 0 / 256.0), - "lum80chr13": (146 / 256.0, 216 / 256.0, 126 / 256.0), - "lum80chr14": (186 / 256.0, 186 / 256.0, 255 / 256.0), - "lum80chr15": (152 / 256.0, 196 / 256.0, 255 / 256.0), - "lum80chr16": (152 / 256.0, 203 / 256.0, 254 / 256.0), - "lum80chr17": (0 / 256.0, 224 / 256.0, 224 / 256.0), - "lum80chr18": (156 / 256.0, 208 / 256.0, 208 / 256.0), - "lum80chr19": (250 / 256.0, 161 / 256.0, 255 / 256.0), - "lum80chr20": (255 / 256.0, 146 / 256.0, 255 / 256.0), - "lum80chr21": (227 / 256.0, 177 / 256.0, 255 / 256.0), - "lum80chr22": (198 / 256.0, 198 / 256.0, 198 / 256.0), - "lum80chr23": (153 / 256.0, 153 / 256.0, 153 / 256.0), - "lum80chrX": (153 / 256.0, 153 / 256.0, 153 / 256.0), - "lum80chr24": (204 / 256.0, 204 / 256.0, 204 / 256.0), - "lum80chrY": (204 / 256.0, 204 / 256.0, 204 / 256.0), - "lum80chrM": (174 / 256.0, 174 / 256.0, 122 / 256.0), - "lum80chr0": (174 / 256.0, 174 / 256.0, 122 / 256.0), - "lum80chrUn": (108 / 256.0, 191 / 256.0, 38 / 256.0), - "lum80chrNA": (171 / 256.0, 171 / 256.0, 171 / 256.0), - "vlpurple": (218 / 256.0, 218 / 256.0, 235 / 256.0), - "vlorange": (253 / 256.0, 208 / 256.0, 162 / 256.0), - "vlpgreen": (218 / 256.0, 218 / 256.0, 235 / 
256.0), -} - - -ecolor = { - "interchromosomal": "blue", - "concordant": "black", - "everted": (139 / 256.0, 69 / 256.0, 19 / 256.0), # 'brown', yellow', - "forward": "magenta", - "reverse": (0 / 256.0, 139 / 256.0, 139 / 256.0), #'cyan', - "discordant": "red", -} diff --git a/bin/realigner.py b/bin/realigner.py old mode 100644 new mode 100755 diff --git a/bin/ref_util.py b/bin/ref_util.py deleted file mode 100755 index 1ebc1cd5..00000000 --- a/bin/ref_util.py +++ /dev/null @@ -1,832 +0,0 @@ -# This software is Copyright 2017 The Regents of the University of California. All Rights Reserved. Permission to copy, modify, and distribute this software and its documentation for educational, research and non-profit purposes, without fee, and without a written agreement is hereby granted, provided that the above copyright notice, this paragraph and the following three paragraphs appear in all copies. Permission to make commercial use of this software may be obtained by contacting: -# -# Office of Innovation and Commercialization -# -# University of California -# -# La Jolla, CA 92093-0910 -# -# (858) 534-5815 -# -# invent@ucsd.edu -# -# This software program and documentation are copyrighted by The Regents of the University of California. The software program and documentation are supplied "as is", without any accompanying services from The Regents. The Regents does not warrant that the operation of the program will be uninterrupted or error-free. The end-user understands that the program was developed for research purposes and is advised not to rely exclusively on the program for any reason. -# -# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. - -# Author: Viraj Deshpande -# Contact: virajbdeshpande@gmail.com -# Maintained by Jens Luebeck, jluebeck@ucsd.edu -# Source: https://github.com/AmpliconSuite/AmpliconSuite-pipeline -# Commit: 0a8a2ff2324b15aab7cb88d310dcc458d06c0bed - - -##This is a suite to load reference genome (not just hg19, as filename implies), genes, exons, repeat content and perform operations on this genome, compare variants -## it handles annotations from a database and is not restricted to solely hg19 if global_names.REF is not hg19. - -import sys -from bisect import bisect_left -from collections import defaultdict -from time import time -import pysam -import heapq -import copy -import os -import logging - -import global_names - -REF = global_names.REF -TSTART = global_names.TSTART -print("Global ref name is " + REF) - -try: - DATA_REPO = os.environ["AA_DATA_REPO"] -except: - logging.warning( - "#TIME " + "%.3f\t" % (time() - TSTART) + " Unable to set AA_DATA_REPO variable. Setting to working directory" - ) - DATA_REPO = "." -if DATA_REPO == "." or DATA_REPO == "": - logging.warning( - "#TIME " + "%.3f\t" % (time() - TSTART) + " AA_DATA_REPO not set or empy. Setting to working directory" - ) - DATA_REPO = "." 
- -REF_files = defaultdict(lambda: "", {}) -try: - for l in open(DATA_REPO + "/" + REF + "/file_list.txt"): - REF_files[l.strip().split()[0]] = l.strip().split()[1] -except: - logging.warning( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + " Unable to find reference in $AA_DATA_REPO/REF/file_list.txt. Setting to empty." - ) - - -class fake_fasta(object): - def fetch(self, a=None, b=0, c=0): - return "".join(["N" for i in range(c - b + 1)]) - - -try: - fa_file = pysam.Fastafile(DATA_REPO + "/" + REF + "/" + REF_files["fa_file"]) -except: - logging.warning( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + ' Unable to open fasta file: "' - + DATA_REPO - + "/" - + REF - + "/" - + REF_files["fa_file"] - + '". Reference sequences will be set to N.' - ) - fa_file = fake_fasta() - -chrLen_filename = DATA_REPO + "/" + REF + "/" + REF_files["chrLen_file"] -duke35_filename = DATA_REPO + "/" + REF + "/" + REF_files["duke35_filename"] -wgexclude_filename = DATA_REPO + "/" + REF + "/" + REF_files["mapability_exclude_filename"] -gene_filename = DATA_REPO + "/" + REF + "/" + REF_files["gene_filename"] -exon_filename = DATA_REPO + "/" + REF + "/" + REF_files["exon_file"] -oncogene_filename = DATA_REPO + "/" + REF + "/" + REF_files["oncogene_filename"] -centromere_filename = DATA_REPO + "/" + REF + "/" + REF_files["centromere_filename"] -conserved_regions_filename = DATA_REPO + "/" + REF + "/" + REF_files["conserved_regions_filename"] -segdup_filename = DATA_REPO + "/" + REF + "/" + REF_files["segdup_filename"] -complementary_nucleotide = defaultdict( - lambda: "N", {"A": "T", "C": "G", "G": "C", "T": "A", "a": "t", "c": "g", "g": "c", "t": "a", "n": "n", "N": "N"} -) -duke35 = [] -duke35_exists = [True] - -# Handling chromosome names, lengths, sorting, positions and addition of new chromosomes -chr_id = {} -chrName = {} -chromList = [str(x) for x in range(1, 23)] + ["X" + "Y"] # must be updated if including an organism with more chroms. 
- - -def chrNum(chrname, mode="append"): - if chrname in chr_id: - return chr_id[chrname] - else: - if mode == "init": - cnum = len(chr_id) - else: - cnum = 1000000 + len(chr_id) - chr_id[chrname] = cnum - chrName[cnum] = chrname - return chr_id[chrname] - - -chrLen = defaultdict(lambda: 0, {}) -try: - for line in open(chrLen_filename): - ll = line.strip().split() - chrLen[chrNum(ll[0], mode="init")] = int(ll[1]) -except: - logging.warning( - "#TIME " + "%.3f\t" % (time() - TSTART) + ' Unable to open chromosome lengths file: "' + chrLen_filename + '"' - ) - -chrOffset = {} - - -def absPos(chrname, pos=0): - cnum = chrNum(chrname) - if chrNum(chrname) not in chrOffset: - chrkeys = sorted(chrName.keys()) - sumlen = sum([chrLen[c] for c in chrLen if c in chrOffset]) - for i in range(len(chrkeys)): - if chrkeys[i] not in chrOffset: - chrOffset[chrkeys[i]] = sumlen - sumlen += chrLen[chrkeys[i]] - if cnum < chrkeys[i]: - break - return chrOffset[chrNum(chrname)] + pos - - -for c in chrLen: - ap = absPos(chrName[c]) - - -def chrPos(abspos): - for c in chrOffset: - if chrOffset[c] < abspos and chrOffset[c] + chrLen[c] >= abspos: - return (chrName[c], abspos - chrOffset[c]) - return None - - -def update_chrLen(len_list): - for l in len_list: - chrLen[chrNum(l[0])] = int(l[1]) - for l in len_list: - cpos = absPos(l[0], 1) - - -def reverse_complement(seq): - return "".join([complementary_nucleotide[a] for a in seq][::-1]) - - -class interval(object): - def __init__( - self, line, start=-1, end=-1, strand=1, file_format="", bamfile=None, info="", exclude_info_string=False - ): - self.info = "" - self.file_format = file_format - if type(line) == pysam.AlignedRead or type(line) == pysam.AlignedSegment: - self.load_pysamread(line, bamfile) - elif start == -1: - self.load_line(line, file_format, exclude_info_string=exclude_info_string) - elif end == -1: - self.load_pos(line, start, start, strand) - else: - self.load_pos(line, start, end, strand) - if len(info) > 0: - self.info = info - - def load_line(self, line, file_format, exclude_info_string=False): - if file_format == "": - if len(line.strip().split()) == 1: - self.chrom = line.split(":")[0] - self.start = int(line.split(":")[1].split("-")[0]) - if "-" not in line: - self.end = int(line.split(":")[1].split("-")[0]) - else: - self.end = int(line.split(":")[1].split("-")[1]) - if self.start < self.end: - self.strand = 1 - else: - self.strand = -1 - return - else: - file_format = "bed" - if file_format == "gff": - ll = line.strip().split() - self.chrom = ll[0] - self.start, self.end = sorted([int(float(ll[3])), int(float(ll[4]))]) - if ll[6] == "+": - self.strand = 1 - else: - self.strand = -1 - if not exclude_info_string: - self.info = {r[0 : r.find("=")]: r[r.find("=") + 1 :] for r in ll[8].strip().strip(";").split(";")} - self.info["Variant"] = ll[5] - elif file_format == "bed": - ll = line.strip().split() - self.chrom = ll[0] - if (REF == "hg19" or REF == "GRCh38" or REF == "mm10" or REF == "GRCm38") and 0 < len(self.chrom) < 3: - try: - ci = int(self.chrom) - if 0 < ci < 23: - self.chrom = "chr" + self.chrom - logging.info("Corrected chromosome name (appended 'chr') " + self.chrom + " \n") - - except ValueError: - if self.chrom in {"M", "X", "Y"}: - self.chrom = "chr" + self.chrom - else: - logging.warning("Chromosome name " + self.chrom + " may be incompatible") - - self.start, self.end = sorted([int(float(ll[1])), int(float(ll[2]))]) - if int(float(ll[2])) >= int(float(ll[1])): - self.strand = 1 - else: - self.strand = -1 - if not 
exclude_info_string: - self.info = ll[3:] - else: - raise (Exception("Invalid interval format" + str(line))) - - def load_pos(self, chrom, start, end, strand): - self.chrom = chrom - self.start = int(start) - self.end = int(end) - self.strand = strand - if start > end: - self.start = int(end) - self.end = int(start) - self.strand = -1 * strand - - def load_pysamread(self, line, bamfile): - if bamfile is None: - raise Exception("Interval of pysam AlignedRead without bamfile") - self.chrom = line.reference_name - self.start = line.reference_start - self.end = 0 - if line.reference_end is not None: - self.end = line.reference_end - else: - logging.warning("Reference_end for " + str(self) + " was NoneType. Setting to 0.") - - if line.is_reverse: - self.strand = -1 - else: - self.strand = 1 - - def __gt__(self, y): - if self.chrom != y.chrom: - return chrNum(self.chrom) > chrNum(y.chrom) - elif int(self.end) != int(y.end): - return int(self.end) > int(y.end) - else: - return int(self.start) > int(y.start) - - def size(self): - return self.end - self.start + 1 - - def __str__(self): - if len(str(self.info)) == 0: - return "\t".join(map(str, [self.chrom, self.start, self.end])) - elif type(self.info) == list: - return "\t".join(map(str, [self.chrom, self.start, self.end] + list(self.info))) - elif type(self.info) == dict: - return "\t".join( - map(str, [self.chrom, self.start, self.end] + [str(s) + "=" + str(self.info[s]) for s in self.info]) - ) - else: - return "\t".join(map(str, [self.chrom, self.start, self.end, self.info])) - - def gc_content(self): - seq = fa_file.fetch(self.chrom, self.start, self.end) - # if 'G' in seq: - # print seq, seq.count('G'), seq.count('C'), float(seq.count('G') + seq.count('C')) / len(seq) - # exit() - if len(seq) == 0: - return 0.5 - return float(seq.count("G") + seq.count("C") + seq.count("g") + seq.count("c")) / len(seq) - - def sequence(self, new_fa_file=None): - if new_fa_file is not None: - seq = new_fa_file.fetch(self.chrom, self.start, self.end + 1) - else: - seq = fa_file.fetch(self.chrom, self.start, self.end + 1) - if self.strand == 1: - return seq - else: - return "".join([complementary_nucleotide[a] for a in seq][::-1]) - - def intersects(self, n, extend=0, margin=0.0): - if margin > 0.0: - if self.intersects( - interval(n.chrom, n.start, n.end - (1 - margin) * (n.end - n.start)) - ) and self.intersects(interval(n.chrom, n.start + (1 - margin) * (n.end - n.start)), n.end): - return True - else: - s = self - if n.intersects(interval(s.chrom, s.start, s.end - (1 - margin) * (s.end - s.start))) and n.intersects( - interval(s.chrom, s.start + (1 - margin) * (s.end - s.start)), s.end - ): - return True - return False - a = [self.chrom, max(0, self.start - extend), self.end + extend] - b = [n.chrom, n.start, n.end] - if a[0] != b[0]: - return False - if (int(a[1]) - int(b[1])) * (int(a[2]) - int(b[1])) <= 0: - return True - if (int(a[1]) - int(b[2])) * (int(a[2]) - int(b[2])) <= 0: - return True - if (int(a[1]) - int(b[1])) * (int(a[1]) - int(b[2])) <= 0: - return True - if (int(a[2]) - int(b[1])) * (int(a[2]) - int(b[2])) <= 0: - return True - return False - - def intersection(self, y): - if not self.intersects(y): - return None - return interval(self.chrom, max(self.start, y.start), min(self.end, y.end)) - - def merge(self, y, extend=0): - if not self.intersects(y, extend): - return None - return interval(self.chrom, min(self.start, y.start), max(self.end, y.end)) - - def atomize(self, y): - il = interval_list([self, y]) - il.sort() - ilr = [] - if 
il[0].intersects(il[1]): - ilint = il[0].intersection(il[1]) - if il[0].start < il[1].start: - ilr.append((interval(il[0].chrom, il[0].start, ilint.start - 1), [il[0]])) - elif il[1].start < il[0].start: - ilr.append((interval(il[1].chrom, il[1].start, ilint.start - 1), [il[1]])) - ilr.append((ilint, il)) - if il[0].end > il[1].end: - ilr.append((interval(il[0].chrom, ilint.end + 1, il[0].end), [il[0]])) - elif il[1].end > il[0].end: - ilr.append((interval(il[1].chrom, ilint.end + 1, il[1].end), [il[1]])) - return ilr - else: - return [(il[0], [il[0]]), (il[1], [il[1]])] - - def contains(self, x, y=-1, z=-1): - if type(x) == interval: - if self.intersects(x) and self.intersection(x).size() == x.size(): - return True - else: - return False - if y != -1: - if z == -1: - z = y - if ( - self.intersects(interval(x, y, z)) - and self.intersection(interval(x, y, z)).size() == interval(x, y, z).size() - ): - return True - return False - - def filter_repeat(self): - if len(interval_list([self]).intersection(wgexclude)) > 0: - return True - if len(interval_list([self]).intersection(conserved_regions)) > 0: - return True - if self.rep_content() > 4.5: - return True - return False - - def rep_content(self): - # logging.info("#TIME " + '%.3f\t'%(time() - TSTART) + " rep_content: init ") - if self.chrom == "chrM" or self.chrom == "MT": - return 5.0 - if self.chrom.strip("chr") not in chromList: - return 1.0 - s34 = interval(self.chrom, self.start, max(self.start, self.end - 34)) - # logging.info("#TIME " + '%.3f\t'%(time() - TSTART) + " rep_content: to load duke ") - if duke35_exists[0] and len(duke35) == 0: - try: - duke35file = open(duke35_filename) - duke35.extend([l.strip() for l in duke35file]) - duke35file.close() - except: - logging.warning( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + ' rep_content: Unable to open mapability file "' - + duke35_filename - + '".' 
- ) - duke35_exists[0] = False - duke35.extend(["chr_Un 0 1 1"]) - # logging.info("#TIME " + '%.3f\t'%(time() - TSTART) + " rep_content: duke loaded") - ictime = 0 - itime = 0 - hi = len(duke35) - 1 - lo = 0 - numiter = 0 - while hi - lo > 1: - numiter += 1 - p = (hi + lo) // 2 - ctime = time() - m = interval(duke35[p]) - ictime += time() - ctime - ctime = time() - if s34.intersects(m) or m > s34: - hi = p - else: - lo = p - itime += time() - ctime - p = lo - m = interval(duke35[p]) - sum_duke = 0 - len_duke = 0 - # logging.info("#TIME " + '%.3f\t'%(time() - TSTART) + " rep_content: found " + str(numiter) + " " + str(ictime) + " " + str(itime)) - while s34 > m or s34.intersects(m): - if not s34.intersects(m): - p += 1 - if p >= len(duke35) or p <= 0: - raise Exception( - "p index out of range: " - + str(p) - + " " - + str(lo) - + " " - + str(self) - + " " - + str(m) - + " " - + str(interval(duke35[lo])) - ) - m = interval(duke35[p]) - continue - repc = 5.0 if float(m.info[0]) == 0 else 1.0 / float(m.info[0]) - sum_duke += s34.intersection(m).size() * repc - len_duke += s34.intersection(m).size() - p += 1 - if p >= len(duke35): - break - m = interval(duke35[p]) - # logging.info("#TIME " + '%.3f\t'%(time() - TSTART) + " rep_content: done") - # exit() - if len_duke > 0: - return sum_duke / len_duke - else: - return 1.0 - - def num_unmasked(self): - if self.chrom not in fa_file.references: - return self.size() - seq = fa_file.fetch(self.chrom, self.start, self.end) - return len([c for c in seq if c in "ACGT"]) - - def segdup_uniqueness(self): - sl = interval_list([self]).intersection(segdup_list) - slsd = sum([self.intersection(i[1]).size() for i in sl]) - return float(self.size()) / (self.size() + slsd) - - def extend(self, extend_len=0): - return interval( - self.chrom, - max(0, self.start - extend_len), - min(self.end + extend_len, chrLen[chrNum(self.chrom)]), - self.strand, - ) - - -class interval_list(list, object): - def __init__(self, ilist=None, file_format=None, sort=True, exclude_info_string=False): - if ilist == None: - ilist = [] - self.file_format = file_format - if file_format in ["bed", "gff"]: - self.bed_to_list(ilist, exclude_info_string=exclude_info_string) - if file_format is None: - list.__init__(self, ilist) - if sort: - self.sort() - self.offset = None - - def bed_to_list(self, file_name, exclude_info_string=False): - if file_name is not None: - try: - f = open(file_name) - list.__init__( - self, - [ - interval(l, file_format=self.file_format, exclude_info_string=exclude_info_string) - for l in f - if len(l.strip().split()) > 2 and l.strip()[0] != "#" - ], - ) - f.close() - except: - logging.error( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + ' interval_list: Unable to open interval file "' - + file_name - + '".' 
- ) - - def merge_clusters(self, extend=0, margin=0.0): - ml = [] - ci = None - cl = [] - ai = 0 - cend = len(self) - for a in self[::-1]: - ai += 1 - if ci is None or not a.intersects(ci, extend, margin): - cstart = len(self) - ai + 1 - cl = self[cstart:cend] - if ci is not None: - ml.append((ci, cl)) - ci = a - cl = [] - cend = len(self) - ai + 1 - # if ai != sum([len(m[1]) for m in ml]) + 1: - # print "divergent", ai, str(a) - # exit() - ci = ci.merge(a, extend) - # cl.append(a) - cstart = 0 - cl = self[cstart:cend] - if ci is not None: - ml.append((ci, cl)) - return ml[::-1] - - def repeats(self, count=1): - activeq = [] - if activeq is None: - print("h1") - exit() - jinterval = None - ilist = [] - for a in self[::-1]: - while len(activeq) > 0 and not a.intersects(activeq[0][1]): - heapq.heappop(activeq) - if activeq is None: - print("h2") - exit() - if len(activeq) < count and jinterval is not None: - ilist.append((jinterval, copy.copy(aq))) - if activeq is None: - print("h3") - exit() - jinterval = None - heapq.heappush(activeq, (-1 * a.start, a)) - if len(activeq) >= count: - if jinterval is None: - jinterval = interval(a.chrom, activeq[0][1].start, a.end) - aq = copy.copy(activeq) - else: - jinterval.start = min(jinterval.start, activeq[0][1].start) - heapq.heappush(aq, (-1 * a.start, a)) - if jinterval is not None: - ilist.append((jinterval, copy.copy(aq))) - jinterval = None - return ilist[::-1] - - def intersection(self, l2, extend=0): - si = len(self) - 1 - l2i = len(l2) - 1 - sj = len(self) - 1 - l2j = len(l2) - 1 - il = [] - while si >= 0: - while l2i >= 0 and l2[l2i] > self[si] and not self[si].intersects(l2[l2i], extend=extend): - l2i -= 1 - l2j = l2i - while l2j >= 0 and self[si].intersects(l2[l2j], extend=extend): - il.append((self[si], l2[l2j])) - l2j -= 1 - si -= 1 - return il[::-1] - - def atomize(self, h2): - i = 0 - j = 0 - atomlist = [] - if len(self) > 0: - c1 = self[0] - if len(h2) > 0: - c2 = h2[0] - c = None - while i < len(self) or j < len(h2): - # if c is not None: - # print "%%", i, j, str(c[0]), [str(aa) for aa in c[1]], [str(aa[0]) for aa in atomlist] - # else: - # print "%%", i, j, [], [str(aa[0]) for aa in atomlist] - if c is not None: - if i < len(self) and self[i] not in c[1] and (self[i].intersects(c[0], -1) or c[0] > self[i]): - atm = self[i].atomize(c[0]) - atm = [(aa[0], [(lambda x: c[1][0] if x == c[0] else x)(aai) for aai in aa[1]]) for aa in atm] - # print "%i", [len(rr[1]) for rr in atm], [str(rr[0]) for rr in atm] - c = atm[-1] - i += 1 - atomlist += atm[:-1] - elif j < len(h2) and h2[j] not in c[1] and (h2[j].intersects(c[0], -1) or c[0] > h2[j]): - # print j, str(h2[j]), str(c[0]), c[0] > h2[j] - atm = c[0].atomize(h2[j]) - atm = [(aa[0], [(lambda x: c[1][0] if x == c[0] else x)(aai) for aai in aa[1]]) for aa in atm] - # print "%j", [len(rr[1]) for rr in atm], [str(rr[0]) for rr in atm] - c = atm[-1] - j += 1 - atomlist += atm[:-1] - else: - atomlist.append(c) - # if i < len(self) and self[i] in c[1]: - # i += 1 - # if j < len(h2) and h2[j] in c[1]: - # j += 1 - c = None - else: - if i >= len(self): - atomlist.append((h2[j], [h2[j]])) - j += 1 - elif j >= len(h2): - atomlist.append((self[i], [self[i]])) - i += 1 - else: - atm = self[i].atomize(h2[j]) - atomlist += atm[:-1] - c = atm[-1] - # if self[i] not in c[1]: - i += 1 - # if h2[j] not in c[1]: - j += 1 - if c is not None: - atomlist.append(c) - return atomlist - - def get_repeat_content(self): - try: - duke35_file = open(duke35_filename) - print("counting repeats", time()) - self.sort() 
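# The size-weighted average computed below uses the same mappability convention as
# rep_content above: a track interval whose uniqueness score is s contributes weight
# 5.0 when s == 0 and 1/s otherwise. Minimal sketch of that transform, assuming s is
# the first info field of a mappability track line:
def repeat_weight(s):
    s = float(s)
    return 5.0 if s == 0 else 1.0 / s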
- sum_duke = [0.0 for i in self] - len_duke = [0.0 for i in self] - lno = 0 - i = 0 - j = 0 - for line in duke35_file: - lno += 1 - duke_int = interval(line) - while not (duke_int.intersects(self[i])) and duke_int > self[i]: - i += 1 - if not duke_int.intersects(self[i]) and self[i] > duke_int: - continue - j = i - repc = 5.0 if float(duke_int.info[0]) == 0 else 1 / float(duke_int.info[0]) - while j < len(self) and self[j].intersects(duke_int): - sum_duke[j] += self[j].intersection(duke_int).size() * repc - len_duke[j] += self[j].intersection(duke_int).size() - j += 1 - duke35_file.close() - return {self[i]: sum_duke[i] / len_duke[i] for i in range(len(interval_list))} - except: - logging.warning( - "#TIME " - + "%.3f\t" % (time() - TSTART) - + ' get_repeat_content: Unable to open mapability file "' - + duke35_filename - + '".' - ) - duke35_exists[0] = False - duke35.extend(["chr_Un 0 1 1"]) - return {self[i]: 1.0 for i in range(len(interval_list))} - - def offsets(self): - if self.offset is not None: - return self.offset - gap = 0.1 - hratio = 0.8 - - vlist = [i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"] - hlist = [i for i in self if chrNum(i.chrom) < 100 or i.chrom[:3] == "chr"] - v_count = len([i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"]) - h_count = len(self) - v_count - h_sum = sum([i.size() for i in hlist]) - v_sum = sum([i.size() for i in vlist]) - - hK = len([i for i in hlist if i.size() < h_sum * gap / max(1, h_count)]) - hS = sum([i.size() for i in hlist if i.size() > h_sum * gap / max(1, h_count)]) - min_hsize = hS / (max(1, h_count) / gap - hK) - h_sum = hS + hK * min_hsize - - vK = len([i for i in vlist if i.size() < v_sum * gap / max(1, v_count)]) - vS = sum([i.size() for i in vlist if i.size() > v_sum * gap / max(1, v_count)]) - min_vsize = vS / (max(1, v_count) / gap - vK) - v_sum = vS + vK * min_vsize - - offset = {} - - h_start = 0 - hscale = 1 if v_count == 0 else hratio - v_start = 0 if h_count == 0 else hratio - vscale = 1 if h_count == 0 else (1 - hratio) - - hgap = gap / h_count if h_count > 0 else 0 - vgap = gap / v_count if v_count > 0 else 0 - hpos = h_start + (hgap / 2) * hscale - vpos = v_start + (vgap / 2) * vscale - for i in hlist: - isize = max(i.size(), min_hsize) - offset[i] = (hpos, hpos + ((1 - gap) * isize / h_sum) * hscale) - hpos = hpos + ((1 - gap) * isize / h_sum + hgap) * hscale - for i in vlist: - isize = max(i.size(), min_vsize) - offset[i] = (vpos, vpos + ((1 - gap) * isize / v_sum) * vscale) - vpos = vpos + ((1 - gap) * isize / v_sum + vgap) * vscale - self.offset = offset - # for i in self: - # print str(i), offset[i], i.size(), hgap, h_sum, hscale, gap, hpos, vpos - # exit() - return offset - - def xpos(self, chrom, pos): - offset = self.offsets() - for i in self: - if i.intersects(interval(chrom, max(0, pos - 1), pos)): - o = offset[i] - return (o[1] * (pos - i.start) + o[0] * (i.end - pos)) / (i.end - i.start) - return None - - def offset_breaks(self): - offset = self.offsets() - gap = 0.1 - hratio = 0.8 - - vlist = [i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"] - hlist = [i for i in self if chrNum(i.chrom) < 100 or i.chrom[:3] == "chr"] - v_count = len([i for i in self if chrNum(i.chrom) >= 100 and i.chrom[:3] != "chr"]) - h_count = len(self) - v_count - h_sum = sum([i.size() for i in hlist]) - v_sum = sum([i.size() for i in vlist]) - - hscale = 1 if v_count == 0 else hratio - vscale = 1 if h_count == 0 else (1 - hratio) - - hgap = gap / h_count if h_count > 0 else 0 - 
vgap = gap / v_count if v_count > 0 else 0 - - breaks = [] - iprev = None - for i in self: - if iprev is None: - iprev = i - continue - if i in hlist and iprev.chrom == i.chrom: - breaks.append((offset[i][0] - hscale * hgap / 2, ":", i.chrom)) - print(str(i), str(iprev), i in hlist, iprev.chrom == i.chrom) - elif i in hlist and iprev.chrom != i.chrom: - breaks.append((offset[i][0] - hscale * hgap / 2, "--", i.chrom)) - elif i in vlist and iprev in hlist: - breaks.append((offset[i][0] - vscale * vgap / 2, "-", i.chrom)) - elif i in vlist and i.chrom == iprev.chrom: - breaks.append((offset[i][0] - vscale * vgap / 2, ":", i.chrom)) - else: - breaks.append((offset[i][0] - vscale * vgap / 2, "--", i.chrom)) - - iprev = i - return breaks - - def __str__(self): - return str(([str(i) for i in self])) - - -oncogene_list = interval_list(oncogene_filename, "gff") -oncogene_list.sort() -gene_list = interval_list(gene_filename, "gff") - - -exon_list = interval_list([]) - - -def load_exons(): - if len(exon_list) > 0: - return - try: - exon_file = open(exon_filename) - exonFields = [ - interval(j, file_format="gff") - for j in exon_file.read().strip().split("\n") - if ( - len(j.strip()) > 0 - and j.strip()[0] != "#" - and {r.split("=")[0]: r.split("=")[1] for r in j.strip().split()[8].strip(";").split(";")}["color"] - == "000080" - ) - ] - exon_file.close() - exon_list.extend((exonFields)) - except: - logging.warning("#TIME " + "%.3f\t" % (time() - TSTART) + 'unable to load exon file: "' + exon_filename + '"') - - -conserved_regions = interval_list(conserved_regions_filename, "bed") -conserved_regions.sort() - -wgexclude = interval_list(wgexclude_filename, "bed") -wgexclude.sort() - -centromere_list = interval_list(centromere_filename, "bed") -centromere_list.sort() -centromere_list = interval_list([i[0] for i in centromere_list.merge_clusters(extend=1)]) - - -segdup_list = interval_list(segdup_filename, "bed") -segdup_list.sort() diff --git a/bin/repeats.py b/bin/repeats.py old mode 100644 new mode 100755 diff --git a/bin/sample_metadata_skeleton.json b/bin/sample_metadata_skeleton.json deleted file mode 100644 index 9c3e7863..00000000 --- a/bin/sample_metadata_skeleton.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "sample_type": "", - "sample_source": "", - "tissue_of_origin": "", - "sample_description": "" -} \ No newline at end of file diff --git a/bin/simulations.py b/bin/simulations.py old mode 100644 new mode 100755 diff --git a/bin/utils.py b/bin/utils.py old mode 100644 new mode 100755 From c69d62de8eeecb8514e9096f309221b61fd2ae94 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Mon, 3 Jul 2023 16:12:28 +0100 Subject: [PATCH 14/48] Removed due to depracation --- .../ampliconclassifier/ampliconsimilarity.nf | 54 ---------------- .../ampliconclassifier/featuresimilarity.nf | 57 ----------------- modules/local/ampliconclassifier/makeinput.nf | 62 ------------------- 3 files changed, 173 deletions(-) delete mode 100644 modules/local/ampliconclassifier/ampliconsimilarity.nf delete mode 100644 modules/local/ampliconclassifier/featuresimilarity.nf delete mode 100644 modules/local/ampliconclassifier/makeinput.nf diff --git a/modules/local/ampliconclassifier/ampliconsimilarity.nf b/modules/local/ampliconclassifier/ampliconsimilarity.nf deleted file mode 100644 index 612e97ff..00000000 --- a/modules/local/ampliconclassifier/ampliconsimilarity.nf +++ /dev/null @@ -1,54 +0,0 @@ -process AMPLICONCLASSIFIER_AMPLICONSIMILARITY { - tag "AA Amplicons" - label 'process_low' - - conda "bioconda::ampliconclassifier=0.4.14" 
- container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ampliconclassifier:0.4.14--hdfd78af_0': - 'quay.io/biocontainers/ampliconclassifier:0.4.14--hdfd78af_0' }" - - input: - path(input) - - output: - path("*_scores.tsv") , emit: scores - path("*") - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - """ - REF=${params.reference_build} - export AA_DATA_REPO=${params.aa_data_repo} - export AA_SRC=${projectDir}/bin - - amplicon_similarity.py \\ - --ref \$REF \\ - $args \\ - --input $input - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - """ - REF=${params.reference_build} - export AA_DATA_REPO=${params.aa_data_repo} - export AA_SRC=${projectDir}/bin - - amplicon_similarity.py --help - touch "ampliconclassifier_similarity_scores.tsv" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) - END_VERSIONS - """ -} diff --git a/modules/local/ampliconclassifier/featuresimilarity.nf b/modules/local/ampliconclassifier/featuresimilarity.nf deleted file mode 100644 index 805f914a..00000000 --- a/modules/local/ampliconclassifier/featuresimilarity.nf +++ /dev/null @@ -1,57 +0,0 @@ -process AMPLICONCLASSIFIER_FEATURESIMILARITY { - tag "AA Amplicons" - label 'process_low' - - conda "bioconda::ampliconclassifier=0.4.14" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/ampliconclassifier:0.4.14--hdfd78af_0': - 'quay.io/biocontainers/ampliconclassifier:0.4.14--hdfd78af_0' }" - - input: - path(input) - - output: - path("*_scores.tsv") , emit: scores - path("*") - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - """ - REF=${params.reference_build} - export AA_DATA_REPO=${params.aa_data_repo} - export AA_SRC=${projectDir}/bin - export AC_SRC=\$(dirname \$(which feature_similarity.py)) - - feature_similarity.py \\ - --ref \$REF \\ - $args \\ - -f $input - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - """ - REF=${params.reference_build} - export AA_DATA_REPO=${params.aa_data_repo} - export AA_SRC=${projectDir}/bin - export AC_SRC=\$(dirname \$(which feature_similarity.py)) -) - - feature_similarity.py --help - touch "ampliconclassifier_similarity_scores.tsv" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) - END_VERSIONS - """ -} diff --git a/modules/local/ampliconclassifier/makeinput.nf b/modules/local/ampliconclassifier/makeinput.nf deleted file mode 100644 index b75e1b1d..00000000 --- a/modules/local/ampliconclassifier/makeinput.nf +++ /dev/null @@ -1,62 +0,0 @@ -process AMPLICONCLASSIFIER_MAKEINPUT { - tag 'AA Amplicons' - label 'process_low' - - conda "bioconda::ampliconclassifier=0.4.14" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ampliconclassifier:0.4.14--hdfd78af_0': - 'quay.io/biocontainers/ampliconclassifier:0.4.14--hdfd78af_0' }" - - input: - val(id) - path(summary) - - output: - path "*.input" , emit: input - // path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - - """ - # Take vectors as input - vector1=(\$(echo $id | sed 's/\\[//g' | sed 's/, / /g' | sed 's/\\]//g' )) - vector2=(\$(echo $summary | sed 's/\\[//g' | sed 's/, /,/g' )) - - echo \$vector1 - echo \$vector2 - - # Check that vectors are of equal length - if [ \${#vector1[@]} -ne \${#vector2[@]} ]; then - echo "Vectors are not of equal length." 
- exit 1 - fi - - # Sort the vectors - vector1_sorted=(\$(printf '%s\n' "\${vector1[@]}"|sort)) - vector2_sorted=(\$(printf '%s\n' "\${vector2[@]}"|sort)) - - # Write to file - for index in \${!vector1_sorted[@]}; do - echo \${vector1_sorted[\$index]}\t\${vector2_sorted[\$index]} - done > run_metadata_list.input - -# "${task.process}": -# AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) -# END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - """ - touch "ampliconclassifier.input" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) - END_VERSIONS - """ -} From cd8476122d9b5b3ec534206ac042dbc3229f0fea Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Mon, 3 Jul 2023 16:12:36 +0100 Subject: [PATCH 15/48] Updates --- nextflow.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index f4d13262..8b0f1a2e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -27,7 +27,7 @@ params { save_sorted_bam = false // Circular DNA identification options - circle_identifier = 'circexplorer2' + circle_identifier = null // FASTQC options skip_qc = false @@ -265,7 +265,7 @@ manifest { description = """Pipeline for the identification of circular DNAs""" mainScript = 'main.nf' nextflowVersion = '!>=22.10.1' - version = '1.0.3' + version = '1.0.5dev' doi = '10.5281/zenodo.7712010' } From f1e48a07c1210ebabe40d6d517112bf0268bacd3 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Mon, 3 Jul 2023 16:17:00 +0100 Subject: [PATCH 16/48] updates --- modules/local/ampliconarchitect/ampliconarchitect.nf | 4 ---- 1 file changed, 4 deletions(-) diff --git a/modules/local/ampliconarchitect/ampliconarchitect.nf b/modules/local/ampliconarchitect/ampliconarchitect.nf index 4abffbb4..6642d8eb 100644 --- a/modules/local/ampliconarchitect/ampliconarchitect.nf +++ b/modules/local/ampliconarchitect/ampliconarchitect.nf @@ -2,10 +2,6 @@ process AMPLICONARCHITECT_AMPLICONARCHITECT { tag "$meta.id" label 'process_low' - // conda "conda-forge::python=2.7 bioconda::pysam=0.15.2 anaconda::flask=1.1.2 anaconda::cython=0.29.14 anaconda::numpy=1.16.6 anaconda::scipy=1.2.1 conda-forge::matplotlib=2.2.5 mosek::mosek=8.0.60 anaconda::future=0.18.2" - // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- // 'https://depot.galaxyproject.org/singularity/mulled-v2-6eefa51f13933d65b4f8155ca2f8cd81dea474ba:baa777f7c4e89a2ec4d1eab7d424a1f46503bac7-0': - // 'quay.io/biocontainers/mulled-v2-6eefa51f13933d65b4f8155ca2f8cd81dea474ba:baa777f7c4e89a2ec4d1eab7d424a1f46503bac7-0' }" conda "bioconda::ampliconsuite=0.1555.2 mosek::mosek=10.1b1" container '/home/local/BICR/dschreye/ampliconsuite.sif' From 7483cbe3cf8f0a785c52360bbaf01a03b1506cb0 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Fri, 7 Jul 2023 09:08:13 +0100 Subject: [PATCH 17/48] Updated Tag --- .github/workflows/build-docker-image.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml index f9206fd2..40d4b515 100644 --- a/.github/workflows/build-docker-image.yml +++ b/.github/workflows/build-docker-image.yml @@ -24,4 +24,4 @@ jobs: with: file: modules/local/ampliconsuite/Dockerfile push: true - tags: "quay.io/nf-core/prepareaa:latest" + tags: "quay.io/nf-core/prepareaa:1.0.0" From 74cd8fe6863ec40abda64ae839e985100125e5d7 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Wed, 12 Jul 2023 14:31:56 +0100 Subject: [PATCH 18/48] Updated with new variables for ampliconarchitect modules --- .../ampliconarchitect/ampliconarchitect.nf | 18 +- .../ampliconclassifier/ampliconclassifier.nf | 16 +- modules/local/ampliconclassifier/circdna.nf | 607 ++++++++++++++++++ .../ampliconclassifier/makeresultstable.nf | 63 -- modules/local/ampliconsuite/prepareaa.nf | 16 +- 5 files changed, 648 insertions(+), 72 deletions(-) create mode 100644 modules/local/ampliconclassifier/circdna.nf delete mode 100644 modules/local/ampliconclassifier/makeresultstable.nf diff --git a/modules/local/ampliconarchitect/ampliconarchitect.nf b/modules/local/ampliconarchitect/ampliconarchitect.nf index 6642d8eb..a79daeeb 100644 --- a/modules/local/ampliconarchitect/ampliconarchitect.nf +++ b/modules/local/ampliconarchitect/ampliconarchitect.nf @@ -3,7 +3,7 @@ process AMPLICONARCHITECT_AMPLICONARCHITECT { label 'process_low' conda "bioconda::ampliconsuite=0.1555.2 mosek::mosek=10.1b1" - container '/home/local/BICR/dschreye/ampliconsuite.sif' + container 'quay.io/nf-core/prepareaa:1.0.0' input: tuple val(meta), path(bam), path(bai), path(bed) @@ -26,8 +26,20 @@ process AMPLICONARCHITECT_AMPLICONARCHITECT { """ export AA_DATA_REPO=${params.aa_data_repo} export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - export AA_SRC=\$(dirname \$(readlink -f \$(which AmpliconArchitect.py))) - export AC_SRC=\$(dirname \$(readlink -f \$(which amplicon_classifier.py))) + + # Define Variables AA_SRC and AC_SRC + if ! command -v AmpliconArchitect.py &> /dev/null; then + export AA_SRC=\$(dirname \$(python -c "import ampliconarchitectlib; print(ampliconarchitectlib.__file__)") + else + export AA_SRC=\$(dirname \$(readlink -f \$(which AmpliconArchitect.py))) + fi + + if ! 
command -v amplicon_classifier.py &> /dev/null; then + export AC_SRC=\$(dirname \$(python -c "import ampliconclassifierlib; print(ampliconclassifierlib.__file__)") + else + export AC_SRC=\$(dirname \$(readlink -f \$(which amplicon_classifier.py))) + fi + REF=${params.reference_build} AmpliconSuite-pipeline.py \\ diff --git a/modules/local/ampliconclassifier/ampliconclassifier.nf b/modules/local/ampliconclassifier/ampliconclassifier.nf index 053f7214..ac0a4aae 100644 --- a/modules/local/ampliconclassifier/ampliconclassifier.nf +++ b/modules/local/ampliconclassifier/ampliconclassifier.nf @@ -3,7 +3,7 @@ process AMPLICONCLASSIFIER_AMPLICONCLASSIFIER { label 'process_low' conda "bioconda::ampliconsuite=0.1555.2 mosek::mosek=10.1b1" - container '/home/local/BICR/dschreye/ampliconsuite.sif' + container 'quay.io/nf-core/prepareaa:1.0.0' input: path (graphs) @@ -20,8 +20,18 @@ process AMPLICONCLASSIFIER_AMPLICONCLASSIFIER { """ export AA_DATA_REPO=${params.aa_data_repo} - export AA_SRC=\$(dirname \$(readlink -f \$(which AmpliconArchitect.py))) - export AC_SRC=\$(dirname \$(readlink -f \$(which amplicon_classifier.py))) + # Define Variables AA_SRC and AC_SRC + if ! command -v AmpliconArchitect.py &> /dev/null; then + export AA_SRC=\$(dirname \$(python -c "import ampliconarchitectlib; print(ampliconarchitectlib.__file__)") + else + export AA_SRC=\$(dirname \$(readlink -f \$(which AmpliconArchitect.py))) + fi + + if ! command -v amplicon_classifier.py &> /dev/null; then + export AC_SRC=\$(dirname \$(python -c "import ampliconclassifierlib; print(ampliconclassifierlib.__file__)") + else + export AC_SRC=\$(dirname \$(readlink -f \$(which amplicon_classifier.py))) + fi AmpliconSuite-pipeline.py \\ -s $prefix \\ diff --git a/modules/local/ampliconclassifier/circdna.nf b/modules/local/ampliconclassifier/circdna.nf new file mode 100644 index 00000000..34e03ff8 --- /dev/null +++ b/modules/local/ampliconclassifier/circdna.nf @@ -0,0 +1,607 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + VALIDATE INPUTS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + + +def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) + +// Validate input parameters +WorkflowCircdna.initialise(params, log) + +// Check input path parameters to see if they exist +def checkPathParamList = [ params.input, params.multiqc_config, params.fasta ] +for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } + +// Check mandatory parameters +if (params.input) { ch_input = Channel.fromPath(params.input) } else { exit 1, 'Input samplesheet not specified!' } +if (params.fasta) { ch_fasta = Channel.fromPath(params.fasta) } else { exit 1, 'Fasta reference genome not specified!' } + +if (!(params.input_format == "FASTQ" | params.input_format == "BAM")) { + exit 1, 'Please specifiy --input_format "FASTQ" or "BAM" in capital letters, depending on the input file format.' 
+} + +// Modify fasta channel to include meta data +ch_fasta_meta = ch_fasta.map{ it -> [[id:it[0].baseName], it] }.collect() + +branch = params.circle_identifier.split(",") +run_circexplorer2 = ("circexplorer2" in branch) +run_circle_map_realign = ("circle_map_realign" in branch) +run_circle_map_repeats = ("circle_map_repeats" in branch) +run_circle_finder = ("circle_finder" in branch) +run_ampliconarchitect = ("ampliconarchitect" in branch) +run_unicycler = ("unicycler" in branch) + +if (!(run_unicycler | run_circle_map_realign | run_circle_map_repeats | run_circle_finder | run_ampliconarchitect | run_circexplorer2)) { + exit 1, 'circle_identifier param not valid. Please check!' +} + +if (run_unicycler && !params.input_format == "FASTQ") { + exit 1, 'Unicycler needs FastQ input. Please specify input_format == "FASTQ", if possible, or don`t run unicycler.' +} + +if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } + +// Check if BWA Index is given +if (params.bwa_index) { + ch_bwa_index = Channel.fromPath(params.bwa_index).collect() + bwa_index_exists = true + } else { + ch_bwa_index = Channel.empty() + bwa_index_exists = false + } + +// AMPLICON ARCHITECT INPUT +if (run_ampliconarchitect) { + mosek_license_dir = file(params.mosek_license_dir) + if (!mosek_license_dir.exists()) { + exit 1, "Mosek License Directory is missing! Please specifiy directory containing mosek license using --mosek_license_dir and rename license to 'mosek.lic'." + } + if (!params.aa_data_repo) { exit 1, "AmpliconArchitect Data Repository Missing! Please see https://github.com/jluebeck/AmpliconArchitect for more information and specify its absolute path using --aa_data_repo." } + if (params.reference_build != "hg19" & params.reference_build != "GRCh38" & params.reference_build != "GRCh37" & params.reference_build != "mm10"){ + exit 1, "Reference Build not given! Please specify --reference_build 'mm10', 'hg19', 'GRCh38', or 'GRCh37'." + } + + if (!params.cnvkit_cnn) { + ch_cnvkit_reference = file(params.aa_data_repo + "/" + params.reference_build + "/" + params.reference_build + "_cnvkit_filtered_ref.cnn", checkIfExists: true) + } else { + ch_cnvkit_reference = file(params.cnvkit_cnn) + } +} + + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + CONFIG FILES +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() +ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() +ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT LOCAL MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules +// +include { INPUT_CHECK } from '../subworkflows/local/input_check' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT NF-CORE MODULES/SUBWORKFLOWS & LOCAL MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' + + +// CONCATENATE FASTQ +include { CAT_FASTQ } from '../modules/nf-core/cat/fastq/main' + +// QUALITY CONTROL +include { FASTQC } from '../modules/nf-core/fastqc/main' + +// TRIMMING +include { TRIMGALORE } from '../modules/nf-core/trimgalore/main' + +// Genome Preparation +include { BWA_INDEX } from '../modules/nf-core/bwa/index/main' + +// Alignment +include { BWA_MEM } from '../modules/local/bwa/mem/main' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_BAM } from '../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_BAM } from '../modules/nf-core/samtools/index/main' + +// PICARD +include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main' +include { BAM_MARKDUPLICATES_PICARD } from '../subworkflows/nf-core/bam_markduplicates_picard/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_FILTER } from '../modules/nf-core/samtools/view/main' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_FILTERED } from '../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_FILTERED } from '../modules/nf-core/samtools/index/main' + +// BAM STATS +include { BAM_STATS_SAMTOOLS } from '../subworkflows/nf-core/bam_stats_samtools/main' + +// CIRCLE-MAP +include { CIRCLEMAP_READEXTRACTOR } from '../modules/local/circlemap/readextractor.nf' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_RE } from '../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_RE } from '../modules/nf-core/samtools/index/main' +include { SAMTOOLS_SORT as SAMTOOLS_SORT_QNAME_CM } from '../modules/nf-core/samtools/sort/main' +include { CIRCLEMAP_REALIGN } from '../modules/local/circlemap/realign.nf' +include { CIRCLEMAP_REPEATS } from '../modules/local/circlemap/repeats.nf' + +// CIRCLE_FINDER +include { SAMTOOLS_SORT as SAMTOOLS_SORT_QNAME_CF } from '../modules/nf-core/samtools/sort/main' +include { SAMBLASTER } from '../modules/local/samblaster.nf' +include { BEDTOOLS_SORTEDBAM2BED } from '../modules/local/bedtools/sortedbam2bed.nf' +include { BEDTOOLS_SPLITBAM2BED } from '../modules/local/bedtools/splitbam2bed.nf' +include { CIRCLEFINDER } from '../modules/local/circlefinder.nf' + +// CIRCexplorer2 +include { CIRCEXPLORER2_PARSE } from '../modules/local/circexplorer2/parse.nf' + +// AmpliconArchitect +include { CNVKIT_BATCH } from '../modules/local/cnvkit/batch/main.nf' +include { CNVKIT_SEGMENT } from '../modules/local/cnvkit/segment.nf' +include { PREPAREAA } from '../modules/local/ampliconsuite/prepareaa.nf' +include { COLLECT_SEEDS } from '../modules/local/collect_seeds.nf' +include { AMPLIFIED_INTERVALS } from '../modules/local/amplified_intervals.nf' +include { AMPLICONARCHITECT_AMPLICONARCHITECT } 
from '../modules/local/ampliconarchitect/ampliconarchitect.nf' +include { AMPLICONCLASSIFIER_AMPLICONCLASSIFIER } from '../modules/local/ampliconclassifier/ampliconclassifier.nf' + +// Unicycler +include { UNICYCLER } from '../modules/local/unicycler/main.nf' +include { SEQTK_SEQ } from '../modules/local/seqtk/seq.nf' +include { GETCIRCULARREADS } from '../modules/local/getcircularreads.nf' +include { MINIMAP2_ALIGN } from '../modules/nf-core/minimap2/align/main.nf' + + +// MULTIQC +include { MULTIQC } from '../modules/local/multiqc.nf' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RUN MAIN WORKFLOW +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// Info required for completion email and summary +def multiqc_report = [] + +workflow CIRCDNA { + ch_versions = Channel.empty() + + // Define Empty Channels for MultiQC + ch_samtools_stats = Channel.empty() + ch_samtools_flagstat = Channel.empty() + ch_samtools_idxstats = Channel.empty() + ch_markduplicates_stats = Channel.empty() + ch_markduplicates_flagstat = Channel.empty() + ch_markduplicates_idxstats = Channel.empty() + ch_markduplicates_multiqc = Channel.empty() + + // Check file format + if (params.input_format == "FASTQ") { + // + // SUBWORKFLOW: Read in samplesheet, validate and stage input files + // + INPUT_CHECK ( + ch_input + ) + .reads + .map { + meta, fastq -> + meta.id = meta.id.split('_')[0..-2].join('_') + [ meta, fastq ] } + .groupTuple(by: [0]) + .branch { + meta, fastq -> + single : fastq.size() == 1 + return [ meta, fastq.flatten() ] + multiple: fastq.size() > 1 + return [ meta, fastq.flatten() ] + } + .set { ch_fastq } + ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) + + // + // MODULE: Concatenate FASTQs from the same samples + // + CAT_FASTQ ( + ch_fastq.multiple + ) + .reads + .mix(ch_fastq.single) + .set { ch_cat_fastq } + + ch_versions = ch_versions.mix(CAT_FASTQ.out.versions) + + + // + // MODULE: Run FastQC + // + ch_fastqc_multiqc = Channel.empty() + if ( ! params.skip_qc ) { + FASTQC ( + ch_cat_fastq + ) + ch_versions = ch_versions.mix(FASTQC.out.versions) + ch_fastqc_multiqc = FASTQC.out.zip + } + + // + // MODULE: Run trimgalore + // + if ( ! 
params.skip_trimming ) { + TRIMGALORE ( + ch_cat_fastq + ) + ch_trimmed_reads = TRIMGALORE.out.reads + ch_trimgalore_multiqc = TRIMGALORE.out.zip + ch_trimgalore_multiqc_log = TRIMGALORE.out.log + ch_versions = ch_versions.mix(TRIMGALORE.out.versions) + } else { + ch_trimmed_reads = INPUT_CHECK.out.reads + ch_trimgalore_multiqc = Channel.empty() + ch_trimgalore_multiqc_log = Channel.empty() + } + + // + // MODULE: Run bwa index + // + if (!bwa_index_exists & (run_ampliconarchitect | run_circexplorer2 | + run_circle_finder | run_circle_map_realign | + run_circle_map_repeats)) { + BWA_INDEX ( + ch_fasta_meta + ) + ch_bwa_index = BWA_INDEX.out.index.map{ meta, index -> ["bwa_index", index] }.collect() + ch_versions = ch_versions.mix(BWA_INDEX.out.versions) + } + + + // + // MODULE: BWA MEM ALIGNMENT + // + if (run_ampliconarchitect | run_circexplorer2 | run_circle_finder | + run_circle_map_realign | run_circle_map_repeats) { + BWA_MEM ( + ch_trimmed_reads, + ch_bwa_index, + Channel.value(true) + ) + ch_bam_sorted = BWA_MEM.out.bam + ch_full_bam_sorted = BWA_MEM.out.bam + ch_bwa_sorted = BWA_MEM.out.bam + ch_versions = ch_versions.mix(BWA_MEM.out.versions) + + // SAMTOOLS INDEX SORTED BAM + SAMTOOLS_INDEX_BAM ( + ch_bam_sorted + ) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX_BAM.out.versions) + } + } else if (params.input_format == "BAM") { + // Use BAM Files as input + INPUT_CHECK ( + ch_input + ) + if (!params.bam_sorted){ + SAMTOOLS_SORT_BAM ( + INPUT_CHECK.out.reads + ) + ch_versions = ch_versions.mix(SAMTOOLS_SORT_BAM.out.versions) + ch_bam_sorted = SAMTOOLS_SORT_BAM.out.bam + } else { + ch_bam_sorted = INPUT_CHECK.out.reads + ch_full_bam_sorted = INPUT_CHECK.out.reads + ch_bwa_sorted = INPUT_CHECK.out.reads + } + // SAMTOOLS INDEX SORTED BAM + SAMTOOLS_INDEX_BAM ( + ch_bam_sorted + ) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX_BAM.out.versions) + ch_fastqc_multiqc = Channel.empty() + ch_trimgalore_multiqc = Channel.empty() + ch_trimgalore_multiqc_log = Channel.empty() + } + + + + + if (run_ampliconarchitect | run_circexplorer2 | run_circle_finder | + run_circle_map_realign | run_circle_map_repeats) { + + // Define Index channel and additional bam sorted channels for Circle_finder - not usable with duplicates removed + ch_bam_sorted_bai = SAMTOOLS_INDEX_BAM.out.bai + ch_full_bam_sorted = ch_bam_sorted + ch_full_bam_sorted_bai = SAMTOOLS_INDEX_BAM.out.bai + + ch_fasta = ch_fasta_meta.map{ meta, index -> [index] }.collect() + + // Stub run is not yet implemented into BAM_STATS_SAMTOOLS subworkflow -> Will be skipped when stub is active + if (!workflow.stubRun) { + BAM_STATS_SAMTOOLS ( + ch_bam_sorted.join(ch_bam_sorted_bai). 
+ map { meta, bam, bai -> [meta, bam, bai] }, + ch_fasta_meta + ) + ch_versions = ch_versions.mix(BAM_STATS_SAMTOOLS.out.versions) + ch_samtools_stats = BAM_STATS_SAMTOOLS.out.stats + ch_samtools_flagstat = BAM_STATS_SAMTOOLS.out.flagstat + ch_samtools_idxstats = BAM_STATS_SAMTOOLS.out.idxstats + } + + // PICARD MARK_DUPLICATES + if (!params.skip_markduplicates) { + // Index Fasta File for Markduplicates + SAMTOOLS_FAIDX ( + ch_fasta_meta, + [[], []] + ) + + // MARK DUPLICATES IN BAM FILE + BAM_MARKDUPLICATES_PICARD ( + ch_bam_sorted, + ch_fasta_meta, + SAMTOOLS_FAIDX.out.fai.collect() + ) + + // FILTER DUPLICATES IN BAM FILES USING SAMTOOLS VIEW + if (!params.keep_duplicates) { + SAMTOOLS_VIEW_FILTER ( + ch_bam_sorted.join(ch_bam_sorted_bai), + ch_fasta_meta, + [] + ) + ch_versions = ch_versions.mix(SAMTOOLS_VIEW_FILTER.out.versions) + + // SORT FILTERED BAM FILE + SAMTOOLS_SORT_FILTERED ( + SAMTOOLS_VIEW_FILTER.out.bam + ) + ch_versions = ch_versions.mix(SAMTOOLS_SORT_FILTERED.out.versions) + + // INDEX FILTERED BAM FILE + SAMTOOLS_INDEX_FILTERED ( + SAMTOOLS_SORT_FILTERED.out.bam + ) + + ch_bam_sorted = SAMTOOLS_SORT_FILTERED.out.bam + ch_bam_sorted_bai = SAMTOOLS_INDEX_FILTERED.out.bai + ch_versions = ch_versions.mix(SAMTOOLS_INDEX_FILTERED.out.versions) + } + else { + ch_bam_sorted = BAM_MARKDUPLICATES_PICARD.out.bam + ch_bam_sorted_bai = BAM_MARKDUPLICATES_PICARD.out.bai + ch_markduplicates_stats = BAM_MARKDUPLICATES_PICARD.out.stats + ch_markduplicates_flagstat = BAM_MARKDUPLICATES_PICARD.out.flagstat + ch_markduplicates_idxstats = BAM_MARKDUPLICATES_PICARD.out.idxstats + ch_markduplicates_multiqc = BAM_MARKDUPLICATES_PICARD.out.metrics + ch_versions = ch_versions.mix(BAM_MARKDUPLICATES_PICARD.out.versions) + } + } else { + ch_markduplicates_stats = Channel.empty() + ch_markduplicates_flagstat = Channel.empty() + ch_markduplicates_idxstats = Channel.empty() + ch_markduplicates_multiqc = Channel.empty() + } + } + + if (run_ampliconarchitect) { + PREPAREAA ( + ch_bam_sorted + ) + ch_versions = ch_versions.mix(PREPAREAA.out.versions) + + AMPLICONARCHITECT_AMPLICONARCHITECT ( + ch_bam_sorted.join(ch_bam_sorted_bai). + join(PREPAREAA.out.bed) + ) + ch_versions = ch_versions.mix(AMPLICONARCHITECT_AMPLICONARCHITECT.out.versions) + + ch_aa_cycles = AMPLICONARCHITECT_AMPLICONARCHITECT.out.cycles. + map {meta, path -> [path]} + ch_aa_graphs = AMPLICONARCHITECT_AMPLICONARCHITECT.out.graph. + map {meta, path -> [path]} + ch_aa_cnseg = AMPLICONARCHITECT_AMPLICONARCHITECT.out.cnseg. 
+ map {meta, path -> [path]} + + AMPLICONCLASSIFIER_AMPLICONCLASSIFIER ( + ch_aa_graphs.flatten().collect().ifEmpty([]), + ch_aa_cycles.flatten().collect().ifEmpty([]), + ch_aa_cnseg.flatten().collect().ifEmpty([]) + ) + ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.versions) + } + + + // + // SUBWORKFLOW - RUN CIRCLE_FINDER PIPELINE + // + if (run_circle_finder) { + SAMTOOLS_SORT_QNAME_CF ( + ch_full_bam_sorted + ) + ch_versions = ch_versions.mix(SAMTOOLS_SORT_QNAME_CF.out.versions) + + SAMBLASTER ( + SAMTOOLS_SORT_QNAME_CF.out.bam + ) + ch_versions = ch_versions.mix(SAMBLASTER.out.versions) + + BEDTOOLS_SPLITBAM2BED ( + SAMBLASTER.out.split_bam + ) + ch_versions = ch_versions.mix(BEDTOOLS_SPLITBAM2BED.out.versions) + + BEDTOOLS_SORTEDBAM2BED ( + ch_full_bam_sorted.join(ch_full_bam_sorted_bai) + ) + ch_versions = ch_versions.mix(BEDTOOLS_SORTEDBAM2BED.out.versions) + + ch_b2b_sorted = BEDTOOLS_SORTEDBAM2BED.out.conc_txt + ch_b2b_split = BEDTOOLS_SPLITBAM2BED.out.split_txt + CIRCLEFINDER ( + ch_b2b_split.join(ch_b2b_sorted) + ) + } + + // + // SUBWORKFLOW: RUN CIRCLE-MAP REALIGN or REPEATS PIPELINE + // + if (run_circle_map_realign || + run_circle_map_repeats) { + SAMTOOLS_SORT_QNAME_CM ( + ch_bam_sorted + ) + ch_versions = ch_versions.mix(SAMTOOLS_SORT_QNAME_CM.out.versions) + + CIRCLEMAP_READEXTRACTOR ( + SAMTOOLS_SORT_QNAME_CM.out.bam + ) + ch_versions = ch_versions.mix(CIRCLEMAP_READEXTRACTOR.out.versions) + + SAMTOOLS_SORT_RE ( + CIRCLEMAP_READEXTRACTOR.out.bam + ) + ch_versions = ch_versions.mix(SAMTOOLS_SORT_RE.out.versions) + + SAMTOOLS_INDEX_RE ( + SAMTOOLS_SORT_RE.out.bam + ) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX_RE.out.versions) + + // DEFINE CHANNELS FOR REALIGN AND REPEATS + ch_qname_sorted_bam = SAMTOOLS_SORT_QNAME_CM.out.bam + ch_re_sorted_bam = SAMTOOLS_SORT_RE.out.bam + ch_re_sorted_bai = SAMTOOLS_INDEX_RE.out.bai + + // + // MODULE: RUN CIRCLE_MAP REPEATS + // + if (run_circle_map_repeats) { + CIRCLEMAP_REPEATS ( + ch_re_sorted_bam.join(ch_re_sorted_bai) + ) + ch_versions = ch_versions.mix(CIRCLEMAP_REPEATS.out.versions) + } + + // + // MODULE: Run Circle-Map Realign + // + if (run_circle_map_realign) { + + CIRCLEMAP_REALIGN ( + ch_re_sorted_bam.join(ch_re_sorted_bai). + join(ch_qname_sorted_bam). + join(ch_bam_sorted). 
+ join(ch_bam_sorted_bai), + ch_fasta + ) + ch_versions = ch_versions.mix(CIRCLEMAP_REALIGN.out.versions) + } + } + + + if (run_circexplorer2) { + CIRCEXPLORER2_PARSE ( + ch_bam_sorted.join(ch_bam_sorted_bai) + ) + ch_versions = ch_versions.mix(CIRCEXPLORER2_PARSE.out.versions) + } + + if (run_unicycler && params.input_format == "FASTQ") { + + UNICYCLER ( + ch_trimmed_reads + ) + ch_versions = ch_versions.mix(UNICYCLER.out.versions) + + SEQTK_SEQ ( + UNICYCLER.out.scaffolds + ) + ch_versions = ch_versions.mix(SEQTK_SEQ.out.versions) + + GETCIRCULARREADS ( + SEQTK_SEQ.out.fastq + ) + + GETCIRCULARREADS.out.fastq + .map { meta, fastq -> [ meta + [single_end: true], fastq ] } + .set { ch_circular_fastq } + + MINIMAP2_ALIGN ( + ch_circular_fastq, + ch_fasta, + false, + false, + false + ) + ch_versions = ch_versions.mix(MINIMAP2_ALIGN.out.versions) + } + + // + // MODULE: Pipeline reporting + // + CUSTOM_DUMPSOFTWAREVERSIONS ( + ch_versions.unique().collectFile(name: 'collated_versions.yml') + ) + + // + // MODULE: MultiQC + // + if (!params.skip_multiqc) { + workflow_summary = WorkflowCircdna.paramsSummaryMultiqc(workflow, summary_params) + ch_workflow_summary = Channel.value(workflow_summary) + + MULTIQC ( + ch_multiqc_config, + ch_multiqc_custom_config.collect().ifEmpty([]), + CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect(), + ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'), + ch_fastqc_multiqc.collect{it[1]}.ifEmpty([]), + ch_trimgalore_multiqc.collect{it[1]}.ifEmpty([]), + ch_trimgalore_multiqc_log.collect{it[1]}.ifEmpty([]), + ch_samtools_stats.collect{it[1]}.ifEmpty([]), + ch_samtools_flagstat.collect{it[1]}.ifEmpty([]), + ch_samtools_idxstats.collect{it[1]}.ifEmpty([]), + ch_markduplicates_flagstat.collect{it[1]}.ifEmpty([]), + ch_markduplicates_stats.collect{it[1]}.ifEmpty([]), + ch_markduplicates_idxstats.collect{it[1]}.ifEmpty([]), + ch_markduplicates_multiqc.collect{it[1]}.ifEmpty([]), + ) + multiqc_report = MULTIQC.out.report.toList() + } +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + COMPLETION EMAIL AND SUMMARY +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + + +workflow.onComplete { + if (params.email || params.email_on_fail) { + NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report) + } + NfcoreTemplate.summary(workflow, params, log) + if (params.hook_url) { + NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) + } +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + THE END +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ diff --git a/modules/local/ampliconclassifier/makeresultstable.nf b/modules/local/ampliconclassifier/makeresultstable.nf deleted file mode 100644 index facb00f3..00000000 --- a/modules/local/ampliconclassifier/makeresultstable.nf +++ /dev/null @@ -1,63 +0,0 @@ -process AMPLICONCLASSIFIER_MAKERESULTSTABLE { - tag 'AA Amplicons' - label 'process_low' - - conda "bioconda::ampliconclassifier=0.4.14" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/ampliconclassifier:0.4.14--hdfd78af_0': - 'quay.io/biocontainers/ampliconclassifier:0.4.14--hdfd78af_0' }" - - input: - path (metadata) - path (class_file) - path (gene_list) - path (feature_entropy) - path (basic_properties) - path (bed_files) - - output: - path "*result_data.json" , emit: json - path "*result_table.tsv" , emit: tsv - path "index.html" , emit: html - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - """ - export AA_DATA_REPO=${params.aa_data_repo} - REF=${params.reference_build} - - # Create subdirectories in working directory - mkdir ampliconclassifier_classification_bed_files - mv $bed_files ampliconclassifier_classification_bed_files/ - - make_results_table.py \\ - $args \\ - --run_metadata_list $metadata \\ - --classification_file $class_file - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - - """ - make_results_table.py --help - - touch ampliconclasifier_result_data.json - touch ampliconclasifier_result_table.tsv - touch index.html - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconClassifier: \$(echo \$(amplicon_classifier.py --version | sed 's/amplicon_classifier //g' | sed 's/ .*//g')) - END_VERSIONS - """ -} diff --git a/modules/local/ampliconsuite/prepareaa.nf b/modules/local/ampliconsuite/prepareaa.nf index e2251f3a..08c24b7f 100644 --- a/modules/local/ampliconsuite/prepareaa.nf +++ b/modules/local/ampliconsuite/prepareaa.nf @@ -3,7 +3,7 @@ process PREPAREAA { label 'process_low' conda "bioconda::ampliconsuite=0.1555.2 mosek::mosek=10.1b1" - container '/home/local/BICR/dschreye/src/AmpliconSuite-pipeline/docker/test/ampliconsuite.img' + container 'quay.io/nf-core/prepareaa:1.0.0' input: tuple val(meta), path(bam) @@ -27,8 +27,18 @@ process PREPAREAA { """ export AA_DATA_REPO=${params.aa_data_repo} export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - export AA_SRC=\$(dirname \$(readlink -f \$(which AmpliconArchitect.py))) - export AC_SRC=\$(dirname \$(readlink -f \$(which amplicon_classifier.py))) + # Define Variables AA_SRC and AC_SRC + if ! command -v AmpliconArchitect.py &> /dev/null; then + export AA_SRC=\$(dirname \$(python -c "import ampliconarchitectlib; print(ampliconarchitectlib.__file__)") + else + export AA_SRC=\$(dirname \$(readlink -f \$(which AmpliconArchitect.py))) + fi + + if ! 
command -v amplicon_classifier.py &> /dev/null; then + export AC_SRC=\$(dirname \$(python -c "import ampliconclassifierlib; print(ampliconclassifierlib.__file__)") + else + export AC_SRC=\$(dirname \$(readlink -f \$(which amplicon_classifier.py))) + fi REF=${params.reference_build} AmpliconSuite-pipeline.py \\ From 032ea60a23e7bd06fe5d706deb2d3cfcb0e9bc1c Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Wed, 12 Jul 2023 14:32:09 +0100 Subject: [PATCH 19/48] Fixes --- workflows/circdna.nf | 4 ---- 1 file changed, 4 deletions(-) diff --git a/workflows/circdna.nf b/workflows/circdna.nf index 452203c9..34e03ff8 100644 --- a/workflows/circdna.nf +++ b/workflows/circdna.nf @@ -155,10 +155,6 @@ include { COLLECT_SEEDS } from '../modules/local include { AMPLIFIED_INTERVALS } from '../modules/local/amplified_intervals.nf' include { AMPLICONARCHITECT_AMPLICONARCHITECT } from '../modules/local/ampliconarchitect/ampliconarchitect.nf' include { AMPLICONCLASSIFIER_AMPLICONCLASSIFIER } from '../modules/local/ampliconclassifier/ampliconclassifier.nf' -include { AMPLICONCLASSIFIER_AMPLICONSIMILARITY } from '../modules/local/ampliconclassifier/ampliconsimilarity.nf' -include { AMPLICONCLASSIFIER_FEATURESIMILARITY } from '../modules/local/ampliconclassifier/featuresimilarity.nf' -include { AMPLICONCLASSIFIER_MAKEINPUT } from '../modules/local/ampliconclassifier/makeinput.nf' -include { AMPLICONCLASSIFIER_MAKERESULTSTABLE } from '../modules/local/ampliconclassifier/makeresultstable.nf' // Unicycler include { UNICYCLER } from '../modules/local/unicycler/main.nf' From d015b5e9b901772a8994d39a5b2f2ef7444d77d8 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Wed, 12 Jul 2023 15:09:11 +0100 Subject: [PATCH 20/48] Changed AA variable command --- modules/local/ampliconarchitect/ampliconarchitect.nf | 5 ++--- modules/local/ampliconclassifier/ampliconclassifier.nf | 5 ++--- modules/local/ampliconsuite/prepareaa.nf | 8 ++++---- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/modules/local/ampliconarchitect/ampliconarchitect.nf b/modules/local/ampliconarchitect/ampliconarchitect.nf index a79daeeb..3a124d63 100644 --- a/modules/local/ampliconarchitect/ampliconarchitect.nf +++ b/modules/local/ampliconarchitect/ampliconarchitect.nf @@ -27,15 +27,14 @@ process AMPLICONARCHITECT_AMPLICONARCHITECT { export AA_DATA_REPO=${params.aa_data_repo} export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - # Define Variables AA_SRC and AC_SRC if ! command -v AmpliconArchitect.py &> /dev/null; then - export AA_SRC=\$(dirname \$(python -c "import ampliconarchitectlib; print(ampliconarchitectlib.__file__)") + export AA_SRC=\$(dirname \$(python -c "import ampliconarchitectlib; print(ampliconarchitectlib.__file__)")) else export AA_SRC=\$(dirname \$(readlink -f \$(which AmpliconArchitect.py))) fi if ! 
command -v amplicon_classifier.py &> /dev/null; then - export AC_SRC=\$(dirname \$(python -c "import ampliconclassifierlib; print(ampliconclassifierlib.__file__)") + export AC_SRC=\$(dirname \$(python -c "import ampliconclassifierlib; print(ampliconclassifierlib.__file__)")) else export AC_SRC=\$(dirname \$(readlink -f \$(which amplicon_classifier.py))) fi diff --git a/modules/local/ampliconclassifier/ampliconclassifier.nf b/modules/local/ampliconclassifier/ampliconclassifier.nf index ac0a4aae..91749eae 100644 --- a/modules/local/ampliconclassifier/ampliconclassifier.nf +++ b/modules/local/ampliconclassifier/ampliconclassifier.nf @@ -20,15 +20,14 @@ process AMPLICONCLASSIFIER_AMPLICONCLASSIFIER { """ export AA_DATA_REPO=${params.aa_data_repo} - # Define Variables AA_SRC and AC_SRC if ! command -v AmpliconArchitect.py &> /dev/null; then - export AA_SRC=\$(dirname \$(python -c "import ampliconarchitectlib; print(ampliconarchitectlib.__file__)") + export AA_SRC=\$(dirname \$(python -c "import ampliconarchitectlib; print(ampliconarchitectlib.__file__)")) else export AA_SRC=\$(dirname \$(readlink -f \$(which AmpliconArchitect.py))) fi if ! command -v amplicon_classifier.py &> /dev/null; then - export AC_SRC=\$(dirname \$(python -c "import ampliconclassifierlib; print(ampliconclassifierlib.__file__)") + export AC_SRC=\$(dirname \$(python -c "import ampliconclassifierlib; print(ampliconclassifierlib.__file__)")) else export AC_SRC=\$(dirname \$(readlink -f \$(which amplicon_classifier.py))) fi diff --git a/modules/local/ampliconsuite/prepareaa.nf b/modules/local/ampliconsuite/prepareaa.nf index 08c24b7f..9f6519c4 100644 --- a/modules/local/ampliconsuite/prepareaa.nf +++ b/modules/local/ampliconsuite/prepareaa.nf @@ -29,13 +29,13 @@ process PREPAREAA { export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} # Define Variables AA_SRC and AC_SRC if ! command -v AmpliconArchitect.py &> /dev/null; then - export AA_SRC=\$(dirname \$(python -c "import ampliconarchitectlib; print(ampliconarchitectlib.__file__)") + export AA_SRC=\$(dirname \$(python -c "import ampliconarchitectlib; print(ampliconarchitectlib.__file__)")) else export AA_SRC=\$(dirname \$(readlink -f \$(which AmpliconArchitect.py))) fi if ! 
command -v amplicon_classifier.py &> /dev/null; then - export AC_SRC=\$(dirname \$(python -c "import ampliconclassifierlib; print(ampliconclassifierlib.__file__)") + export AC_SRC=\$(dirname \$(python -c "import ampliconclassifierlib; print(ampliconclassifierlib.__file__)")) else export AC_SRC=\$(dirname \$(readlink -f \$(which amplicon_classifier.py))) fi @@ -51,7 +51,7 @@ process PREPAREAA { cat <<-END_VERSIONS > versions.yml "${task.process}": - AmpliconSuite-pipeline.py: \$(echo \$(AmpliconSuite-pipeline.py --version) | sed 's/^.*PrepareAA version //') + AmpliconSuite-pipeline.py: \$(AmpliconSuite-pipeline.py --version | sed 's/AmpliconSuite-pipeline version //') END_VERSIONS """ @@ -76,7 +76,7 @@ process PREPAREAA { cat <<-END_VERSIONS > versions.yml "${task.process}": - prepareaa: \$(echo \$(PrepareAA.py --version) | sed 's/^.*PrepareAA version //') + AmpliconSuite-pipeline.py: \$(AmpliconSuite-pipeline.py --version | sed 's/AmpliconSuite-pipeline version //') END_VERSIONS """ } From 9f2330a7d288dda6be1c4cb0ce24b6b2a92ea14a Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Fri, 5 Jan 2024 07:33:31 +0000 Subject: [PATCH 21/48] Updates --- workflows/circdna.nf | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/workflows/circdna.nf b/workflows/circdna.nf index 34e03ff8..542b30ef 100644 --- a/workflows/circdna.nf +++ b/workflows/circdna.nf @@ -54,9 +54,11 @@ if (params.bwa_index) { // AMPLICON ARCHITECT INPUT if (run_ampliconarchitect) { - mosek_license_dir = file(params.mosek_license_dir) - if (!mosek_license_dir.exists()) { + mosek_license_dir = params.mosek_license_dir + if (!params.mosek_license_dir) { exit 1, "Mosek License Directory is missing! Please specifiy directory containing mosek license using --mosek_license_dir and rename license to 'mosek.lic'." + } else { + mosek_license_dir = file(params.mosek_license_dir) } if (!params.aa_data_repo) { exit 1, "AmpliconArchitect Data Repository Missing! Please see https://github.com/jluebeck/AmpliconArchitect for more information and specify its absolute path using --aa_data_repo." } if (params.reference_build != "hg19" & params.reference_build != "GRCh38" & params.reference_build != "GRCh37" & params.reference_build != "mm10"){ From 9ee14ebb5ee7d3eec22b949828894826c8e132fd Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Sat, 6 Jan 2024 08:11:38 +0000 Subject: [PATCH 22/48] Updates on ampliconsuite, removed ampliconarchitect, not finished yet --- CHANGELOG.md | 30 +++++++++++++++++++++ conf/modules.config | 62 +++++++++----------------------------------- workflows/circdna.nf | 33 +++-------------------- 3 files changed, 45 insertions(+), 80 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 01a019c1..6740af19 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,36 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## v1.0.5 - [2023-06-26] + +### `Added` + +- AmpliconSuite process + +### `Fixed` + +- nf-core template + +### `Dependencies` + +- Added docker container dependency + +### `Deprecated` + +- AmpliconArchitect and AmpliconClassifier - Both used in AmpliconSuite + +## v1.0.4 - [2023-06-26] + +### `Added` + +### `Fixed` + +- Bug that the pipeline only runs with one sample when Picard Markduplicates is used + +### `Dependencies` + +### `Deprecated` + ## v1.0.3 - [2023-05-26] ### `Added` diff --git a/conf/modules.config b/conf/modules.config index 48a73801..f833247d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -307,52 +307,41 @@ process { } // -// AmpliconArchitect Options +// AmpliconSuite Options // process { - withName: 'PREPAREAA' { + withName: 'AMPLICONSUITE' { + time = '96.h' ext.args = "" publishDir = [ [ - path: { "${params.outdir}/prepareaa" }, + path: { "${params.outdir}/ampliconsuite/cnvkit" }, mode: params.publish_dir_mode, pattern: '*{CNV_SEEDS.bed,filtered.bed}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], [ - path: { "${params.outdir}/prepareaa/cnvkit" }, + path: { "${params.outdir}/ampliconsuite/cnvkit" }, mode: params.publish_dir_mode, - pattern: '*{call.cns,cnr.gz,md.cns,CALLS.bed}', + pattern: '*{CNV_SEEDS.bed,pre_filtered.bed,call.cns,cnr.gz,md.cns,CALLS.bed}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], [ - path: { "${params.outdir}/prepareaa/logs" }, + path: { "${params.outdir}/ampliconsuite/ampliconarchitect/logs" }, mode: params.publish_dir_mode, - pattern: '*{.log,.json,log.txt}', - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - ] - } - withName: 'AMPLICONARCHITECT_AMPLICONARCHITECT' { - time = '96.h' - ext.args = "" - publishDir = [ - [ - path: { "${params.outdir}/ampliconarchitect/sv_view" }, - mode: params.publish_dir_mode, - pattern: '*.{png,pdf}', + pattern: '*{logs.txt}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], [ - path: { "${params.outdir}/ampliconarchitect/amplicons" }, + path: { "${params.outdir}/ampliconsuite/logs" }, mode: params.publish_dir_mode, - pattern: '*{graph.txt,cycles.txt}', + pattern: '*{log.txt,summary.txt,sample_metadata.json,run_metadata.json,finish_flag.txt}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], [ - path: { "${params.outdir}/ampliconarchitect/logs" }, + path: { "${params.outdir}/ampliconsuite/ampliconarchitect/" }, mode: params.publish_dir_mode, - pattern: '*{log.txt,summary.txt,sample_metadata.json,run_metadata.json,finish_flag.txt}', + pattern: '*{graph.txt,cycles.txt}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], [ @@ -363,33 +352,6 @@ process { ], ] } - withName: 'AMPLICONCLASSIFIER_AMPLICONCLASSIFIER' { - ext.args = "--report_complexity --verbose_classification --plotstyle 'individual'" - publishDir = [ - path: { "${params.outdir}/ampliconclassifier/" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } - withName: 'AMPLICONCLASSIFIER_FEATURESIMILARITY' { - ext.args = "" - publishDir = [ - path: { "${params.outdir}/ampliconclassifier/featuresimilarity" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, - enabled: true - ] - } - withName: 'AMPLICONCLASSIFIER_MAKEINPUT' { - ext.args = "" - publishDir = [ - path: { "${params.outdir}/ampliconclassifier/makeinput" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: true - ] - } withName: 'AMPLICONCLASSIFIER_MAKERESULTSTABLE' { ext.args = "" publishDir = [ diff --git a/workflows/circdna.nf b/workflows/circdna.nf index 542b30ef..11a7d0cc 100644 --- a/workflows/circdna.nf +++ b/workflows/circdna.nf @@ -150,13 +150,7 @@ include { CIRCLEFINDER } from '../modules/local include { CIRCEXPLORER2_PARSE } from '../modules/local/circexplorer2/parse.nf' // AmpliconArchitect -include { CNVKIT_BATCH } from '../modules/local/cnvkit/batch/main.nf' -include { CNVKIT_SEGMENT } from '../modules/local/cnvkit/segment.nf' -include { PREPAREAA } from '../modules/local/ampliconsuite/prepareaa.nf' -include { COLLECT_SEEDS } from '../modules/local/collect_seeds.nf' -include { AMPLIFIED_INTERVALS } from '../modules/local/amplified_intervals.nf' -include { AMPLICONARCHITECT_AMPLICONARCHITECT } from '../modules/local/ampliconarchitect/ampliconarchitect.nf' -include { AMPLICONCLASSIFIER_AMPLICONCLASSIFIER } from '../modules/local/ampliconclassifier/ampliconclassifier.nf' +include { AMPLICONSUITE } from '../modules/local/ampliconsuite/ampliconsuite.nf' // Unicycler include { UNICYCLER } from '../modules/local/unicycler/main.nf' @@ -399,33 +393,12 @@ workflow CIRCDNA { } if (run_ampliconarchitect) { - PREPAREAA ( + AMPLICONSUITE ( ch_bam_sorted ) - ch_versions = ch_versions.mix(PREPAREAA.out.versions) - - AMPLICONARCHITECT_AMPLICONARCHITECT ( - ch_bam_sorted.join(ch_bam_sorted_bai). - join(PREPAREAA.out.bed) - ) - ch_versions = ch_versions.mix(AMPLICONARCHITECT_AMPLICONARCHITECT.out.versions) - - ch_aa_cycles = AMPLICONARCHITECT_AMPLICONARCHITECT.out.cycles. - map {meta, path -> [path]} - ch_aa_graphs = AMPLICONARCHITECT_AMPLICONARCHITECT.out.graph. - map {meta, path -> [path]} - ch_aa_cnseg = AMPLICONARCHITECT_AMPLICONARCHITECT.out.cnseg. 
- map {meta, path -> [path]} - - AMPLICONCLASSIFIER_AMPLICONCLASSIFIER ( - ch_aa_graphs.flatten().collect().ifEmpty([]), - ch_aa_cycles.flatten().collect().ifEmpty([]), - ch_aa_cnseg.flatten().collect().ifEmpty([]) - ) - ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.versions) + ch_versions = ch_versions.mix(AMPLICONSUITE.out.versions) } - // // SUBWORKFLOW - RUN CIRCLE_FINDER PIPELINE // From 04192861fb0d345108ef044513a57f310d63f8bb Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Sat, 6 Jan 2024 08:51:56 +0000 Subject: [PATCH 23/48] Template update for nf-core/tools v2.11.1 --- .devcontainer/devcontainer.json | 1 + .github/CONTRIBUTING.md | 8 +- .github/ISSUE_TEMPLATE/bug_report.yml | 2 +- .github/PULL_REQUEST_TEMPLATE.md | 1 + .github/workflows/awsfulltest.yml | 12 +- .github/workflows/awstest.yml | 10 +- .github/workflows/ci.yml | 73 +-- .github/workflows/fix-linting.yml | 4 +- .github/workflows/linting.yml | 14 +- .github/workflows/release-announcements.yml | 68 +++ .gitpod.yml | 9 +- CHANGELOG.md | 2 +- CITATIONS.md | 5 + CODE_OF_CONDUCT.md | 133 ++++- README.md | 20 +- assets/methods_description_template.yml | 12 +- assets/multiqc_config.yml | 4 +- assets/slackreport.json | 2 +- conf/modules.config | 3 +- conf/test_full.config | 2 - docs/output.md | 5 +- docs/usage.md | 30 +- lib/NfcoreSchema.groovy | 530 ------------------ lib/NfcoreTemplate.groovy | 36 +- lib/WorkflowMain.groovy | 37 -- main.nf | 18 + modules.json | 6 +- modules/local/ampliconsuite/ampliconsuite.nf | 95 ++++ .../dumpsoftwareversions/environment.yml | 7 + .../custom/dumpsoftwareversions/meta.yml | 7 +- .../dumpsoftwareversions/tests/main.nf.test | 38 ++ .../tests/main.nf.test.snap | 27 + .../dumpsoftwareversions/tests/tags.yml | 2 + modules/nf-core/fastqc/environment.yml | 7 + modules/nf-core/fastqc/meta.yml | 5 + modules/nf-core/fastqc/tests/main.nf.test | 109 ++++ .../nf-core/fastqc/tests/main.nf.test.snap | 10 + modules/nf-core/fastqc/tests/tags.yml | 2 + modules/nf-core/multiqc/environment.yml | 7 + modules/nf-core/multiqc/meta.yml | 11 +- modules/nf-core/multiqc/tests/main.nf.test | 63 +++ modules/nf-core/multiqc/tests/tags.yml | 2 + nextflow.config | 62 +- nextflow_schema.json | 37 +- workflows/circdna.nf | 32 +- 45 files changed, 777 insertions(+), 793 deletions(-) create mode 100644 .github/workflows/release-announcements.yml delete mode 100755 lib/NfcoreSchema.groovy create mode 100644 modules/local/ampliconsuite/ampliconsuite.nf create mode 100644 modules/nf-core/custom/dumpsoftwareversions/environment.yml create mode 100644 modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test create mode 100644 modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap create mode 100644 modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml create mode 100644 modules/nf-core/fastqc/environment.yml create mode 100644 modules/nf-core/fastqc/tests/main.nf.test create mode 100644 modules/nf-core/fastqc/tests/main.nf.test.snap create mode 100644 modules/nf-core/fastqc/tests/tags.yml create mode 100644 modules/nf-core/multiqc/environment.yml create mode 100644 modules/nf-core/multiqc/tests/main.nf.test create mode 100644 modules/nf-core/multiqc/tests/tags.yml diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index ea27a584..4ecfbfe3 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -2,6 +2,7 @@ "name": "nfcore", "image": "nfcore/gitpod:latest", "remoteUser": "gitpod", + "runArgs": ["--privileged"], // Configure 
tool-specific properties. "customizations": { diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 12d0b7ac..a3ebc30c 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -9,7 +9,9 @@ Please use the pre-filled template to save time. However, don't be put off by this template - other more general issues and suggestions are welcome! Contributions to the code are even more welcome ;) -> If you need help using or modifying nf-core/circdna then the best place to ask is on the nf-core Slack [#circdna](https://nfcore.slack.com/channels/circdna) channel ([join our Slack here](https://nf-co.re/join/slack)). +:::info +If you need help using or modifying nf-core/circdna then the best place to ask is on the nf-core Slack [#circdna](https://nfcore.slack.com/channels/circdna) channel ([join our Slack here](https://nf-co.re/join/slack)). +::: ## Contribution workflow @@ -25,6 +27,9 @@ If you're not used to this workflow with git, you can start with some [docs from ## Tests +You can optionally test your changes by running the pipeline locally. Then it is recommended to use the `debug` profile to +receive warnings about process selectors and other debug info. Example: `nextflow run . -profile debug,test,docker --outdir `. + When you create a pull request with changes, [GitHub Actions](https://github.com/features/actions) will run automatic tests. Typically, pull-requests are only fully reviewed when these tests are passing, though of course we can help out before then. @@ -116,4 +121,3 @@ To get started: Devcontainer specs: - [DevContainer config](.devcontainer/devcontainer.json) -- [Dockerfile](.devcontainer/Dockerfile) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index dc1c8693..078e735f 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -42,7 +42,7 @@ body: attributes: label: System information description: | - * Nextflow version _(eg. 22.10.1)_ + * Nextflow version _(eg. 23.04.0)_ * Hardware _(eg. HPC, Desktop, Cloud)_ * Executor _(eg. slurm, local, awsbatch)_ * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, or Apptainer)_ diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 6ef92dc7..7e1699b3 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -19,6 +19,7 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/circ - [ ] If necessary, also make a PR on the nf-core/circdna _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. - [ ] Make sure your code lints (`nf-core lint`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). +- [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir `). - [ ] Usage Documentation in `docs/usage.md` is updated. - [ ] Output Documentation in `docs/output.md` is updated. - [ ] `CHANGELOG.md` is updated. 
diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index bdf169ee..f39d751b 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -14,20 +14,26 @@ jobs: runs-on: ubuntu-latest steps: - name: Launch workflow via tower - uses: nf-core/tower-action@v3 + uses: seqeralabs/action-tower-launch@v2 + # TODO nf-core: You can customise AWS full pipeline tests as required # Add full size test data (but still relatively small datasets for few samples) # on the `test_full.config` test runs with only one set of parameters with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + revision: ${{ github.sha }} workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/circdna/work-${{ github.sha }} parameters: | { + "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/circdna/results-${{ github.sha }}" } - profiles: test_full,aws_tower + profiles: test_full + - uses: actions/upload-artifact@v3 with: name: Tower debug log file - path: tower_action_*.log + path: | + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index 3cae2f04..6341d883 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -12,18 +12,22 @@ jobs: steps: # Launch workflow using Tower CLI tool action - name: Launch workflow via tower - uses: seqeralabs/action-tower-launch@v1 + uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + revision: ${{ github.sha }} workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/circdna/work-${{ github.sha }} parameters: | { "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/circdna/results-test-${{ github.sha }}" } - profiles: test,aws_tower + profiles: test + - uses: actions/upload-artifact@v3 with: name: Tower debug log file - path: tower_action_*.log + path: | + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c5425869..c62a50e0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,11 +24,11 @@ jobs: strategy: matrix: NXF_VER: - - "22.10.1" + - "23.04.0" - "latest-everything" steps: - name: Check out pipeline code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Nextflow uses: nf-core/setup-nextflow@v1 @@ -36,75 +36,8 @@ jobs: version: "${{ matrix.NXF_VER }}" - name: Run pipeline with test data + # TODO nf-core: You can customise CI pipeline run tests as required # For example: adding multiple test runs with different parameters # Remember that you can parallelise this by using strategy.matrix run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results - - test_keep_duplicates: - name: Run pipeline with test data, but remove marked duplicates - # Only run on push if this is the nf-core dev branch (merged PRs) - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/circdna') }}" - runs-on: ubuntu-latest - strategy: - matrix: - NXF_VER: - - "22.10.1" - - "latest-everything" - steps: - - name: Check out pipeline code - uses: actions/checkout@v2 - - - name: Install Nextflow - uses: nf-core/setup-nextflow@v1 - with: - version: "${{ matrix.NXF_VER }}" - - name: Run pipeline with test data, but remove marked duplicates - # For 
example: adding multiple test runs with different parameters - # Remember that you can parallelise this by using strategy.matrix - run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results --keep_duplicates false - - test_skip_markduplicates: - name: Run pipeline with test data, but remove marked duplicates - # Only run on push if this is the nf-core dev branch (merged PRs) - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/circdna') }}" - runs-on: ubuntu-latest - strategy: - matrix: - NXF_VER: - - "22.10.1" - - "latest-everything" - steps: - - name: Check out pipeline code - uses: actions/checkout@v2 - - - name: Install Nextflow - uses: nf-core/setup-nextflow@v1 - with: - version: "${{ matrix.NXF_VER }}" - - name: Run pipeline with test data, but remove marked duplicates - # For example: adding multiple test runs with different parameters - # Remember that you can parallelise this by using strategy.matrix - run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results --skip_markduplicates - - ampliconarchitect: - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/circdna') }}" - runs-on: ubuntu-latest - name: - Run pipeline with test_AA to test functionality of AmpliconArchitect - # For example: adding multiple test runs with different parameters - # Remember that you can parallelise this by using strategy.matrix - steps: - - name: Check out pipeline code - uses: actions/checkout@v2 - - - name: Install Nextflow - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - - - name: Run pipeline with AmpliconArchitect - run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_AA,docker --outdir ./results diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml index 8de169a4..eba9f84d 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix-linting.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest steps: # Use the @nf-core-bot token to check out so we can push later - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: token: ${{ secrets.nf_core_bot_auth_token }} @@ -24,7 +24,7 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} - - uses: actions/setup-node@v3 + - uses: actions/setup-node@v4 - name: Install Prettier run: npm install -g prettier @prettier/plugin-php diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 888cb4bc..905c58e4 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -14,9 +14,9 @@ jobs: EditorConfig: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - uses: actions/setup-node@v3 + - uses: actions/setup-node@v4 - name: Install editorconfig-checker run: npm install -g editorconfig-checker @@ -27,9 +27,9 @@ jobs: Prettier: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - uses: actions/setup-node@v3 + - uses: actions/setup-node@v4 - name: Install Prettier run: npm install -g prettier @@ -40,7 +40,7 @@ jobs: PythonBlack: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check code lints with Black uses: psf/black@stable @@ -71,14 +71,14 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Nextflow uses: nf-core/setup-nextflow@v1 - uses: actions/setup-python@v4 
with: - python-version: "3.8" + python-version: "3.11" architecture: "x64" - name: Install dependencies diff --git a/.github/workflows/release-announcements.yml b/.github/workflows/release-announcements.yml new file mode 100644 index 00000000..6ad33927 --- /dev/null +++ b/.github/workflows/release-announcements.yml @@ -0,0 +1,68 @@ +name: release-announcements +# Automatic release toot and tweet anouncements +on: + release: + types: [published] + workflow_dispatch: + +jobs: + toot: + runs-on: ubuntu-latest + steps: + - uses: rzr/fediverse-action@master + with: + access-token: ${{ secrets.MASTODON_ACCESS_TOKEN }} + host: "mstdn.science" # custom host if not "mastodon.social" (default) + # GitHub event payload + # https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#release + message: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! + + Please see the changelog: ${{ github.event.release.html_url }} + + send-tweet: + runs-on: ubuntu-latest + + steps: + - uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install dependencies + run: pip install tweepy==4.14.0 + - name: Send tweet + shell: python + run: | + import os + import tweepy + + client = tweepy.Client( + access_token=os.getenv("TWITTER_ACCESS_TOKEN"), + access_token_secret=os.getenv("TWITTER_ACCESS_TOKEN_SECRET"), + consumer_key=os.getenv("TWITTER_CONSUMER_KEY"), + consumer_secret=os.getenv("TWITTER_CONSUMER_SECRET"), + ) + tweet = os.getenv("TWEET") + client.create_tweet(text=tweet) + env: + TWEET: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! + + Please see the changelog: ${{ github.event.release.html_url }} + TWITTER_CONSUMER_KEY: ${{ secrets.TWITTER_CONSUMER_KEY }} + TWITTER_CONSUMER_SECRET: ${{ secrets.TWITTER_CONSUMER_SECRET }} + TWITTER_ACCESS_TOKEN: ${{ secrets.TWITTER_ACCESS_TOKEN }} + TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }} + + bsky-post: + runs-on: ubuntu-latest + steps: + - uses: zentered/bluesky-post-action@v0.0.2 + with: + post: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! + + Please see the changelog: ${{ github.event.release.html_url }} + env: + BSKY_IDENTIFIER: ${{ secrets.BSKY_IDENTIFIER }} + BSKY_PASSWORD: ${{ secrets.BSKY_PASSWORD }} + # diff --git a/.gitpod.yml b/.gitpod.yml index 85d95ecc..acf72695 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -1,5 +1,12 @@ image: nfcore/gitpod:latest - +tasks: + - name: Update Nextflow and setup pre-commit + command: | + pre-commit install --install-hooks + nextflow self-update + - name: unset JAVA_TOOL_OPTIONS + command: | + unset JAVA_TOOL_OPTIONS vscode: extensions: # based on nf-core.nf-core-extensionpack - codezombiech.gitignore # Language support for .gitignore files diff --git a/CHANGELOG.md b/CHANGELOG.md index 6740af19..e09f0352 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Fixed` -- nf-core template +- nf-core template update to 2.11.1 ### `Dependencies` diff --git a/CITATIONS.md b/CITATIONS.md index da0b48b9..c3943f33 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -12,6 +12,8 @@ - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) + > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. 
+ - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. @@ -77,5 +79,8 @@ - [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241) + > Merkel, D. (2014). Docker: lightweight linux containers for consistent development and deployment. Linux Journal, 2014(239), 2. doi: 10.5555/2600239.2600241. + - [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) + > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index f4fd052f..c089ec78 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,18 +1,20 @@ -# Code of Conduct at nf-core (v1.0) +# Code of Conduct at nf-core (v1.4) ## Our Pledge -In the interest of fostering an open, collaborative, and welcoming environment, we as contributors and maintainers of nf-core, pledge to making participation in our projects and community a harassment-free experience for everyone, regardless of: +In the interest of fostering an open, collaborative, and welcoming environment, we as contributors and maintainers of nf-core pledge to making participation in our projects and community a harassment-free experience for everyone, regardless of: - Age +- Ability - Body size +- Caste - Familial status - Gender identity and expression - Geographical location - Level of experience - Nationality and national origins - Native language -- Physical and neurological ability +- Neurodiversity - Race or ethnicity - Religion - Sexual identity and orientation @@ -22,80 +24,133 @@ Please note that the list above is alphabetised and is therefore not ranked in a ## Preamble -> Note: This Code of Conduct (CoC) has been drafted by the nf-core Safety Officer and been edited after input from members of the nf-core team and others. "We", in this document, refers to the Safety Officer and members of the nf-core core team, both of whom are deemed to be members of the nf-core community and are therefore required to abide by this Code of Conduct. This document will amended periodically to keep it up-to-date, and in case of any dispute, the most current version will apply. +:::note +This Code of Conduct (CoC) has been drafted by Renuka Kudva, Cris Tuñí, and Michael Heuer, with input from the nf-core Core Team and Susanna Marquez from the nf-core community. "We", in this document, refers to the Safety Officers and members of the nf-core Core Team, both of whom are deemed to be members of the nf-core community and are therefore required to abide by this Code of Conduct. This document will be amended periodically to keep it up-to-date. In case of any dispute, the most current version will apply. +::: -An up-to-date list of members of the nf-core core team can be found [here](https://nf-co.re/about). Our current safety officer is Renuka Kudva. +An up-to-date list of members of the nf-core core team can be found [here](https://nf-co.re/about). + +Our Safety Officers are Saba Nafees, Cris Tuñí, and Michael Heuer. 
nf-core is a young and growing community that welcomes contributions from anyone with a shared vision for [Open Science Policies](https://www.fosteropenscience.eu/taxonomy/term/8). Open science policies encompass inclusive behaviours and we strive to build and maintain a safe and inclusive environment for all individuals. -We have therefore adopted this code of conduct (CoC), which we require all members of our community and attendees in nf-core events to adhere to in all our workspaces at all times. Workspaces include but are not limited to Slack, meetings on Zoom, Jitsi, YouTube live etc. +We have therefore adopted this CoC, which we require all members of our community and attendees of nf-core events to adhere to in all our workspaces at all times. Workspaces include, but are not limited to, Slack, meetings on Zoom, gather.town, YouTube live etc. -Our CoC will be strictly enforced and the nf-core team reserve the right to exclude participants who do not comply with our guidelines from our workspaces and future nf-core activities. +Our CoC will be strictly enforced and the nf-core team reserves the right to exclude participants who do not comply with our guidelines from our workspaces and future nf-core activities. -We ask all members of our community to help maintain a supportive and productive workspace and to avoid behaviours that can make individuals feel unsafe or unwelcome. Please help us maintain and uphold this CoC. +We ask all members of our community to help maintain supportive and productive workspaces and to avoid behaviours that can make individuals feel unsafe or unwelcome. Please help us maintain and uphold this CoC. -Questions, concerns or ideas on what we can include? Contact safety [at] nf-co [dot] re +Questions, concerns, or ideas on what we can include? Contact members of the Safety Team on Slack or email safety [at] nf-co [dot] re. ## Our Responsibilities -The safety officer is responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behaviour. +Members of the Safety Team (the Safety Officers) are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behaviour. -The safety officer in consultation with the nf-core core team have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. +The Safety Team, in consultation with the nf-core core team, have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this CoC, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. -Members of the core team or the safety officer who violate the CoC will be required to recuse themselves pending investigation. They will not have access to any reports of the violations and be subject to the same actions as others in violation of the CoC. +Members of the core team or the Safety Team who violate the CoC will be required to recuse themselves pending investigation. 
They will not have access to any reports of the violations and will be subject to the same actions as others in violation of the CoC. -## When are where does this Code of Conduct apply? +## When and where does this Code of Conduct apply? -Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events. This includes but is not limited to the following listed alphabetically and therefore in no order of preference: +Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events, such as hackathons, workshops, bytesize, and collaborative workspaces on gather.town. These guidelines include, but are not limited to, the following (listed alphabetically and therefore in no order of preference): - Communicating with an official project email address. - Communicating with community members within the nf-core Slack channel. - Participating in hackathons organised by nf-core (both online and in-person events). -- Participating in collaborative work on GitHub, Google Suite, community calls, mentorship meetings, email correspondence. -- Participating in workshops, training, and seminar series organised by nf-core (both online and in-person events). This applies to events hosted on web-based platforms such as Zoom, Jitsi, YouTube live etc. +- Participating in collaborative work on GitHub, Google Suite, community calls, mentorship meetings, email correspondence, and on the nf-core gather.town workspace. +- Participating in workshops, training, and seminar series organised by nf-core (both online and in-person events). This applies to events hosted on web-based platforms such as Zoom, gather.town, Jitsi, YouTube live etc. - Representing nf-core on social media. This includes both official and personal accounts. ## nf-core cares 😊 -nf-core's CoC and expectations of respectful behaviours for all participants (including organisers and the nf-core team) include but are not limited to the following (listed in alphabetical order): +nf-core's CoC and expectations of respectful behaviours for all participants (including organisers and the nf-core team) include, but are not limited to, the following (listed in alphabetical order): - Ask for consent before sharing another community member’s personal information (including photographs) on social media. - Be respectful of differing viewpoints and experiences. We are all here to learn from one another and a difference in opinion can present a good learning opportunity. -- Celebrate your accomplishments at events! (Get creative with your use of emojis 🎉 🥳 💯 🙌 !) +- Celebrate your accomplishments! (Get creative with your use of emojis 🎉 🥳 💯 🙌 !) - Demonstrate empathy towards other community members. (We don’t all have the same amount of time to dedicate to nf-core. If tasks are pending, don’t hesitate to gently remind members of your team. If you are leading a task, ask for help if you feel overwhelmed.) - Engage with and enquire after others. (This is especially important given the geographically remote nature of the nf-core community, so let’s do this the best we can) - Focus on what is best for the team and the community. (When in doubt, ask) -- Graciously accept constructive criticism, yet be unafraid to question, deliberate, and learn. +- Accept feedback, yet be unafraid to question, deliberate, and learn. - Introduce yourself to members of the community. 
(We’ve all been outsiders and we know that talking to strangers can be hard for some, but remember we’re interested in getting to know you and your visions for open science!) -- Show appreciation and **provide clear feedback**. (This is especially important because we don’t see each other in person and it can be harder to interpret subtleties. Also remember that not everyone understands a certain language to the same extent as you do, so **be clear in your communications to be kind.**) +- Show appreciation and **provide clear feedback**. (This is especially important because we don’t see each other in person and it can be harder to interpret subtleties. Also remember that not everyone understands a certain language to the same extent as you do, so **be clear in your communication to be kind.**) - Take breaks when you feel like you need them. -- Using welcoming and inclusive language. (Participants are encouraged to display their chosen pronouns on Zoom or in communication on Slack.) +- Use welcoming and inclusive language. (Participants are encouraged to display their chosen pronouns on Zoom or in communication on Slack) ## nf-core frowns on 😕 -The following behaviours from any participants within the nf-core community (including the organisers) will be considered unacceptable under this code of conduct. Engaging or advocating for any of the following could result in expulsion from nf-core workspaces. +The following behaviours from any participants within the nf-core community (including the organisers) will be considered unacceptable under this CoC. Engaging or advocating for any of the following could result in expulsion from nf-core workspaces: - Deliberate intimidation, stalking or following and sustained disruption of communication among participants of the community. This includes hijacking shared screens through actions such as using the annotate tool in conferencing software such as Zoom. - “Doxing” i.e. posting (or threatening to post) another person’s personal identifying information online. - Spamming or trolling of individuals on social media. -- Use of sexual or discriminatory imagery, comments, or jokes and unwelcome sexual attention. -- Verbal and text comments that reinforce social structures of domination related to gender, gender identity and expression, sexual orientation, ability, physical appearance, body size, race, age, religion or work experience. +- Use of sexual or discriminatory imagery, comments, jokes, or unwelcome sexual attention. +- Verbal and text comments that reinforce social structures of domination related to gender, gender identity and expression, sexual orientation, ability, physical appearance, body size, race, age, religion, or work experience. ### Online Trolling -The majority of nf-core interactions and events are held online. Unfortunately, holding events online comes with the added issue of online trolling. This is unacceptable, reports of such behaviour will be taken very seriously, and perpetrators will be excluded from activities immediately. +The majority of nf-core interactions and events are held online. Unfortunately, holding events online comes with the risk of online trolling. This is unacceptable — reports of such behaviour will be taken very seriously and perpetrators will be excluded from activities immediately. -All community members are required to ask members of the group they are working within for explicit consent prior to taking screenshots of individuals during video calls. 
+All community members are **required** to ask members of the group they are working with for explicit consent prior to taking screenshots of individuals during video calls. -## Procedures for Reporting CoC violations +## Procedures for reporting CoC violations If someone makes you feel uncomfortable through their behaviours or actions, report it as soon as possible. -You can reach out to members of the [nf-core core team](https://nf-co.re/about) and they will forward your concerns to the safety officer(s). +You can reach out to members of the Safety Team (Saba Nafees, Cris Tuñí, and Michael Heuer) on Slack. Alternatively, contact a member of the nf-core core team [nf-core core team](https://nf-co.re/about), and they will forward your concerns to the Safety Team. + +Issues directly concerning members of the Core Team or the Safety Team will be dealt with by other members of the core team and the safety manager — possible conflicts of interest will be taken into account. nf-core is also in discussions about having an ombudsperson and details will be shared in due course. + +All reports will be handled with the utmost discretion and confidentiality. + +You can also report any CoC violations to safety [at] nf-co [dot] re. In your email report, please do your best to include: + +- Your contact information. +- Identifying information (e.g. names, nicknames, pseudonyms) of the participant who has violated the Code of Conduct. +- The behaviour that was in violation and the circumstances surrounding the incident. +- The approximate time of the behaviour (if different than the time the report was made). +- Other people involved in the incident, if applicable. +- If you believe the incident is ongoing. +- If there is a publicly available record (e.g. mailing list record, a screenshot). +- Any additional information. + +After you file a report, one or more members of our Safety Team will contact you to follow up on your report. + +## Who will read and handle reports + +All reports will be read and handled by the members of the Safety Team at nf-core. + +If members of the Safety Team are deemed to have a conflict of interest with a report, they will be required to recuse themselves as per our Code of Conduct and will not have access to any follow-ups. + +To keep this first report confidential from any of the Safety Team members, please submit your first report by direct messaging on Slack/direct email to any of the nf-core members you are comfortable disclosing the information to, and be explicit about which member(s) you do not consent to sharing the information with. + +## Reviewing reports + +After receiving the report, members of the Safety Team will review the incident report to determine whether immediate action is required, for example, whether there is immediate threat to participants’ safety. + +The Safety Team, in consultation with members of the nf-core core team, will assess the information to determine whether the report constitutes a Code of Conduct violation, for them to decide on a course of action. + +In the case of insufficient information, one or more members of the Safety Team may contact the reporter, the reportee, or any other attendees to obtain more information. -Issues directly concerning members of the core team will be dealt with by other members of the core team and the safety manager, and possible conflicts of interest will be taken into account. nf-core is also in discussions about having an ombudsperson, and details will be shared in due course. 
+Once additional information is gathered, the Safety Team will collectively review and decide on the best course of action to take, if any. The Safety Team reserves the right to not act on a report. -All reports will be handled with utmost discretion and confidentially. +## Confidentiality + +All reports, and any additional information included, are only shared with the team of safety officers (and possibly members of the core team, in case the safety officer is in violation of the CoC). We will respect confidentiality requests for the purpose of protecting victims of abuse. + +We will not name harassment victims, beyond discussions between the safety officer and members of the nf-core team, without the explicit consent of the individuals involved. + +## Enforcement + +Actions taken by the nf-core’s Safety Team may include, but are not limited to: + +- Asking anyone to stop a behaviour. +- Asking anyone to leave the event and online spaces either temporarily, for the remainder of the event, or permanently. +- Removing access to the gather.town and Slack, either temporarily or permanently. +- Communicating to all participants to reinforce our expectations for conduct and remind what is unacceptable behaviour; this may be public for practical reasons. +- Communicating to all participants that an incident has taken place and how we will act or have acted — this may be for the purpose of letting event participants know we are aware of and dealing with the incident. +- Banning anyone from participating in nf-core-managed spaces, future events, and activities, either temporarily or permanently. +- No action. ## Attribution and Acknowledgements @@ -106,6 +161,22 @@ All reports will be handled with utmost discretion and confidentially. ## Changelog -### v1.0 - March 12th, 2021 +### v1.4 - February 8th, 2022 + +- Included a new member of the Safety Team. Corrected a typographical error in the text. + +### v1.3 - December 10th, 2021 + +- Added a statement that the CoC applies to nf-core gather.town workspaces. Corrected typographical errors in the text. + +### v1.2 - November 12th, 2021 + +- Removed information specific to reporting CoC violations at the Hackathon in October 2021. + +### v1.1 - October 14th, 2021 + +- Updated with names of new Safety Officers and specific information for the hackathon in October 2021. + +### v1.0 - March 15th, 2021 - Complete rewrite from original [Contributor Covenant](http://contributor-covenant.org/) CoC. 
diff --git a/README.md b/README.md index ff6a4203..62398884 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,9 @@ # ![nf-core/circdna](docs/images/nf-core-circdna_logo_light.png#gh-light-mode-only) ![nf-core/circdna](docs/images/nf-core-circdna_logo_dark.png#gh-dark-mode-only) -[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/circdna/results) [![Cite with Zenodo](https://zenodo.org/badge/DOI/10.5281/zenodo.6685250.svg)](https://doi.org/10.5281/zenodo.6685250) +[![GitHub Actions CI Status](https://github.com/nf-core/circdna/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/circdna/actions?query=workflow%3A%22nf-core+CI%22) +[![GitHub Actions Linting Status](https://github.com/nf-core/circdna/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/circdna/actions?query=workflow%3A%22nf-core+linting%22)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/circdna/results)[![Cite with Zenodo](https://zenodo.org/badge/DOI/10.5281/zenodo.6685250.svg)](https://doi.org/10.5281/zenodo.6685250) -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A522.10.1-23aa62.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg)](https://sylabs.io/docs/) @@ -44,10 +45,8 @@ A graphical view of the pipeline and its diverse branches can be seen below. ## Usage -> **Note** -> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how -> to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) -> with `-profile test` before running the workflow on actual data. +> [!NOTE] +> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. First, prepare a samplesheet with your input data that looks as follows: @@ -91,16 +90,15 @@ Please specify the parameter `circle_identifier` depending on the pipeline branc > `unicycler` uses [Unicycler](https://github.com/rrwick/Unicycler) for de novo assembly of ecDNAs and [Minimap2](https://github.com/lh3/minimap2) for accurate mapping of the identified circular sequences. -> **Warning:** -> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those -> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; +> [!WARNING] +> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; > see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). 
-For more details, please refer to the [usage documentation](https://nf-co.re/circdna/usage) and the [parameter documentation](https://nf-co.re/circdna/parameters). +For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/circdna/usage) and the [parameter documentation](https://nf-co.re/circdna/parameters). ## Pipeline output -To see the the results of a test run with a full size dataset refer to the [results](https://nf-co.re/circdna/results) tab on the nf-core website pipeline page. +To see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/circdna/results) tab on the nf-core website pipeline page. For more details about the output files and reports, please refer to the [output documentation](https://nf-co.re/circdna/output). diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml index 418eb9b5..ecac477c 100644 --- a/assets/methods_description_template.yml +++ b/assets/methods_description_template.yml @@ -3,17 +3,21 @@ description: "Suggested text and references to use when describing pipeline usag section_name: "nf-core/circdna Methods Description" section_href: "https://github.com/nf-core/circdna" plot_type: "html" -## TODO nf-core: Update the HTML below to your prefered methods description, e.g. add publication citation for this pipeline +## TODO nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline ## You inject any metadata in the Nextflow '${workflow}' object data: |

   Methods
-  Data was processed using nf-core/circdna v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020).
+  Data was processed using nf-core/circdna v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.
   The pipeline was executed with Nextflow v${workflow.nextflow.version} (Di Tommaso et al., 2017) with the following command:
   ${workflow.commandLine}
+  ${tool_citations}
   References
-  Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. https://doi.org/10.1038/nbt.3820
-  Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. https://doi.org/10.1038/s41587-020-0439-x
+  Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. doi: 10.1038/nbt.3820
+  Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. doi: 10.1038/s41587-020-0439-x
+  Grüning, B., Dale, R., Sjödin, A., Chapman, B. A., Rowe, J., Tomkins-Tinch, C. H., Valieris, R., Köster, J., & Bioconda Team. (2018). Bioconda: sustainable and comprehensive software distribution for the life sciences. Nature Methods, 15(7), 475–476. doi: 10.1038/s41592-018-0046-7
+  da Veiga Leprevost, F., Grüning, B. A., Alves Aflitos, S., Röst, H. L., Uszkoreit, J., Barsnes, H., Vaudel, M., Moreno, P., Gatto, L., Weber, J., Bai, M., Jimenez, R. C., Sachsenberg, T., Pfeuffer, J., Vera Alvarez, R., Griss, J., Nesvizhskii, A. I., & Perez-Riverol, Y. (2017). BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics (Oxford, England), 33(16), 2580–2582. doi: 10.1093/bioinformatics/btx192
+  ${tool_bibliography}
   Notes:
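The `${...}` placeholders in the methods description above (`${workflow.commandLine}`, `${doi_text}`, `${tool_citations}`, `${tool_bibliography}`) are resolved at run time by the pipeline's Groovy helper before the rendered text is attached to the MultiQC report. The following is only a minimal sketch of that substitution, assuming the usual `groovy.text.SimpleTemplateEngine` approach; the binding keys and example values are hypothetical illustrations, not the pipeline's actual helper code.

```groovy
import groovy.text.SimpleTemplateEngine

// Hypothetical binding; key names mirror the placeholders used in the template above.
def binding = [
    workflow: [
        commandLine: 'nextflow run nf-core/circdna --input samplesheet.csv --outdir results -profile docker',
        manifest   : [version: '1.0.5'],
        nextflow   : [version: '23.04.0']
    ],
    doi_text         : '(doi: 10.5281/zenodo.6685250)',
    tool_citations   : 'Tools used in the workflow included FastQC (Andrews 2010) and MultiQC (Ewels et al. 2016).',
    tool_bibliography: 'Ewels P, et al. MultiQC. Bioinformatics. 2016.'
]

// Triple single-quoted string keeps ${...} literal, so the template engine,
// not Groovy GString interpolation, performs the substitution.
def methodsText = '''Data was processed using nf-core/circdna v${workflow.manifest.version} ${doi_text}.
The pipeline was executed with Nextflow v${workflow.nextflow.version} with the following command: ${workflow.commandLine}
${tool_citations}
${tool_bibliography}'''

def rendered = new SimpleTemplateEngine().createTemplate(methodsText).make(binding).toString()
println rendered
```

In practice the helper builds this binding from the Nextflow `workflow` metadata object and from the tools that actually ran, so the rendered methods text reflects the specific pipeline invocation.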
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 08bed135..2ad47705 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,7 +1,7 @@ report_comment: > - This report has been generated by the nf-core/circdna + This report has been generated by the nf-core/circdna analysis pipeline. For information about how to interpret these results, please see the - documentation. + documentation. report_section_order: "nf-core-circdna-methods-description": order: -1000 diff --git a/assets/slackreport.json b/assets/slackreport.json index 043d02f2..aa69f8e3 100644 --- a/assets/slackreport.json +++ b/assets/slackreport.json @@ -3,7 +3,7 @@ { "fallback": "Plain-text summary of the attachment.", "color": "<% if (success) { %>good<% } else { %>danger<%} %>", - "author_name": "sanger-tol/readmapping v${version} - ${runName}", + "author_name": "nf-core/circdna ${version} - ${runName}", "author_icon": "https://www.nextflow.io/docs/latest/_static/favicon.ico", "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors<% } %>", "fields": [ diff --git a/conf/modules.config b/conf/modules.config index f833247d..ee5b6248 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -482,13 +482,14 @@ process { if (!params.skip_multiqc) { process { withName: 'MULTIQC' { - ext.args = params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' + ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } publishDir = [ path: { "${params.outdir}/multiqc" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } + withName: 'CUSTOM_DUMPSOFTWAREVERSIONS' { publishDir = [ path: { "${params.outdir}/pipeline_info" }, diff --git a/conf/test_full.config b/conf/test_full.config index 18294b84..14a2a4cc 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -10,8 +10,6 @@ ---------------------------------------------------------------------------------------- */ -cleanup = true - params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' diff --git a/docs/output.md b/docs/output.md index 01cb2072..5947472d 100644 --- a/docs/output.md +++ b/docs/output.md @@ -38,7 +38,9 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). -> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. +:::note +The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. +::: ### TrimGalore @@ -379,6 +381,7 @@ The plots will show: - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. 
The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. + - Parameters used by the pipeline run: `params.json`. diff --git a/docs/usage.md b/docs/usage.md index c63c06b5..75cbd5ab 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -30,7 +30,7 @@ The two input formats accepted by the pipeline are "FASTQ" and "BAM". If not spe ### FASTQ -```bash +```csv title="samplesheet.csv" sample,fastq_1,fastq_2 circdna_1,circdna_1_R1.fastq.gz,circdna_1_R2.fastq.gz circdna_2,circdna_2_R1.fastq.gz,circdna_2_R2.fastq.gz @@ -47,7 +47,7 @@ An [example samplesheet fastq](../assets/samplesheet.csv) has been provided with ### BAM -```bash +```csv title="samplesheet.csv" sample,bam circdna_1,circdna_1.bam circdna_2,circdna_2.bam @@ -65,7 +65,7 @@ An [example samplesheet bam](../assets/samplesheet_bam.csv) has been provided wi If using FASTQ input, the `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: -```bash +```csv title="samplesheet.csv" sample,fastq_1,fastq_2 CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz @@ -88,7 +88,7 @@ The pipeline can be run from directly from bam files. Here,the samplesheet has t --input '[path to samplesheet file]' ``` -```console +```csv title="samplesheet.csv" sample,bam sample1, sample1.bam sample2, sample2.bam @@ -107,7 +107,7 @@ An [example samplesheet](../assets/samplesheet_bam.csv) has been provided with t The typical command for running the pipeline is as follows: ```bash -nextflow run nf-core/circdna --input samplesheet.csv --outdir --genome GRCh38 -profile docker --circle_identifier +nextflow run nf-core/circdna --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. @@ -120,12 +120,17 @@ work # Directory containing the nextflow working files .nextflow_log # Log file from Nextflow # Other nextflow hidden files, eg. history of pipeline runs and old logs. -If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file. +:::tip +If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. +::: Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. -> ⚠️ Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). -> The above pipeline run specified with a params file in yaml format: +:::warning +Do not use `-c ` to specify parameters as this will result in errors. 
Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). +::: + +The above pipeline run specified with a params file in yaml format: ```bash nextflow run nf-core/circdna -profile docker -params-file params.yaml @@ -137,7 +142,6 @@ with `params.yaml` containing: input: './samplesheet.csv' outdir: './results/' genome: 'GRCh37' -input: 'data' <...> ``` @@ -165,7 +169,9 @@ To further assist in reproducbility, you can use share and re-use [parameter fil ## Core Nextflow arguments -> **NB:** These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). +:::note +These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). +::: ### `-profile` @@ -173,7 +179,9 @@ Use this parameter to choose a configuration profile. Profiles can give configur Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below. -> We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. +:::info +We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. +::: The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation). diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy deleted file mode 100755 index 9b34804d..00000000 --- a/lib/NfcoreSchema.groovy +++ /dev/null @@ -1,530 +0,0 @@ -// -// This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template. 
-// - -import nextflow.Nextflow -import org.everit.json.schema.Schema -import org.everit.json.schema.loader.SchemaLoader -import org.everit.json.schema.ValidationException -import org.json.JSONObject -import org.json.JSONTokener -import org.json.JSONArray -import groovy.json.JsonSlurper -import groovy.json.JsonBuilder - -class NfcoreSchema { - - // - // Resolve Schema path relative to main workflow directory - // - public static String getSchemaPath(workflow, schema_filename='nextflow_schema.json') { - return "${workflow.projectDir}/${schema_filename}" - } - - // - // Function to loop over all parameters defined in schema and check - // whether the given parameters adhere to the specifications - // - /* groovylint-disable-next-line UnusedPrivateMethodParameter */ - public static void validateParameters(workflow, params, log, schema_filename='nextflow_schema.json') { - def has_error = false - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - // Check for nextflow core params and unexpected params - def json = new File(getSchemaPath(workflow, schema_filename=schema_filename)).text - def Map schemaParams = (Map) new JsonSlurper().parseText(json).get('definitions') - def nf_params = [ - // Options for base `nextflow` command - 'bg', - 'c', - 'C', - 'config', - 'd', - 'D', - 'dockerize', - 'h', - 'log', - 'q', - 'quiet', - 'syslog', - 'v', - - // Options for `nextflow run` command - 'ansi', - 'ansi-log', - 'bg', - 'bucket-dir', - 'c', - 'cache', - 'config', - 'dsl2', - 'dump-channels', - 'dump-hashes', - 'E', - 'entry', - 'latest', - 'lib', - 'main-script', - 'N', - 'name', - 'offline', - 'params-file', - 'pi', - 'plugins', - 'poll-interval', - 'pool-size', - 'profile', - 'ps', - 'qs', - 'queue-size', - 'r', - 'resume', - 'revision', - 'stdin', - 'stub', - 'stub-run', - 'test', - 'w', - 'with-apptainer', - 'with-charliecloud', - 'with-conda', - 'with-dag', - 'with-docker', - 'with-mpi', - 'with-notification', - 'with-podman', - 'with-report', - 'with-singularity', - 'with-timeline', - 'with-tower', - 'with-trace', - 'with-weblog', - 'without-docker', - 'without-podman', - 'work-dir' - ] - def unexpectedParams = [] - - // Collect expected parameters from the schema - def expectedParams = [] - def enums = [:] - for (group in schemaParams) { - for (p in group.value['properties']) { - expectedParams.push(p.key) - if (group.value['properties'][p.key].containsKey('enum')) { - enums[p.key] = group.value['properties'][p.key]['enum'] - } - } - } - - for (specifiedParam in params.keySet()) { - // nextflow params - if (nf_params.contains(specifiedParam)) { - log.error "ERROR: You used a core Nextflow option with two hyphens: '--${specifiedParam}'. 
Please resubmit with '-${specifiedParam}'" - has_error = true - } - // unexpected params - def params_ignore = params.schema_ignore_params.split(',') + 'schema_ignore_params' - def expectedParamsLowerCase = expectedParams.collect{ it.replace("-", "").toLowerCase() } - def specifiedParamLowerCase = specifiedParam.replace("-", "").toLowerCase() - def isCamelCaseBug = (specifiedParam.contains("-") && !expectedParams.contains(specifiedParam) && expectedParamsLowerCase.contains(specifiedParamLowerCase)) - if (!expectedParams.contains(specifiedParam) && !params_ignore.contains(specifiedParam) && !isCamelCaseBug) { - // Temporarily remove camelCase/camel-case params #1035 - def unexpectedParamsLowerCase = unexpectedParams.collect{ it.replace("-", "").toLowerCase()} - if (!unexpectedParamsLowerCase.contains(specifiedParamLowerCase)){ - unexpectedParams.push(specifiedParam) - } - } - } - - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - // Validate parameters against the schema - InputStream input_stream = new File(getSchemaPath(workflow, schema_filename=schema_filename)).newInputStream() - JSONObject raw_schema = new JSONObject(new JSONTokener(input_stream)) - - // Remove anything that's in params.schema_ignore_params - raw_schema = removeIgnoredParams(raw_schema, params) - - Schema schema = SchemaLoader.load(raw_schema) - - // Clean the parameters - def cleanedParams = cleanParameters(params) - - // Convert to JSONObject - def jsonParams = new JsonBuilder(cleanedParams) - JSONObject params_json = new JSONObject(jsonParams.toString()) - - // Validate - try { - schema.validate(params_json) - } catch (ValidationException e) { - println '' - log.error 'ERROR: Validation of pipeline parameters failed!' - JSONObject exceptionJSON = e.toJSON() - printExceptions(exceptionJSON, params_json, log, enums) - println '' - has_error = true - } - - // Check for unexpected parameters - if (unexpectedParams.size() > 0) { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - println '' - def warn_msg = 'Found unexpected parameters:' - for (unexpectedParam in unexpectedParams) { - warn_msg = warn_msg + "\n* --${unexpectedParam}: ${params[unexpectedParam].toString()}" - } - log.warn warn_msg - log.info "- ${colors.dim}Ignore this warning: params.schema_ignore_params = \"${unexpectedParams.join(',')}\" ${colors.reset}" - println '' - } - - if (has_error) { - Nextflow.error('Exiting!') - } - } - - // - // Beautify parameters for --help - // - public static String paramsHelp(workflow, params, command, schema_filename='nextflow_schema.json') { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - Integer num_hidden = 0 - String output = '' - output += 'Typical pipeline command:\n\n' - output += " ${colors.cyan}${command}${colors.reset}\n\n" - Map params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename)) - Integer max_chars = paramsMaxChars(params_map) + 1 - Integer desc_indent = max_chars + 14 - Integer dec_linewidth = 160 - desc_indent - for (group in params_map.keySet()) { - Integer num_params = 0 - String group_output = colors.underlined + colors.bold + group + colors.reset + '\n' - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (group_params.get(param).hidden && !params.show_hidden_params) { - num_hidden += 1 - continue; - } - def type = '[' + group_params.get(param).type + ']' - def description = group_params.get(param).description - def 
defaultValue = group_params.get(param).default != null ? " [default: " + group_params.get(param).default.toString() + "]" : '' - def description_default = description + colors.dim + defaultValue + colors.reset - // Wrap long description texts - // Loosely based on https://dzone.com/articles/groovy-plain-text-word-wrap - if (description_default.length() > dec_linewidth){ - List olines = [] - String oline = "" // " " * indent - description_default.split(" ").each() { wrd -> - if ((oline.size() + wrd.size()) <= dec_linewidth) { - oline += wrd + " " - } else { - olines += oline - oline = wrd + " " - } - } - olines += oline - description_default = olines.join("\n" + " " * desc_indent) - } - group_output += " --" + param.padRight(max_chars) + colors.dim + type.padRight(10) + colors.reset + description_default + '\n' - num_params += 1 - } - group_output += '\n' - if (num_params > 0){ - output += group_output - } - } - if (num_hidden > 0){ - output += colors.dim + "!! Hiding $num_hidden params, use --show_hidden_params to show them !!\n" + colors.reset - } - output += NfcoreTemplate.dashedLine(params.monochrome_logs) - return output - } - - // - // Groovy Map summarising parameters/workflow options used by the pipeline - // - public static LinkedHashMap paramsSummaryMap(workflow, params, schema_filename='nextflow_schema.json') { - // Get a selection of core Nextflow workflow options - def Map workflow_summary = [:] - if (workflow.revision) { - workflow_summary['revision'] = workflow.revision - } - workflow_summary['runName'] = workflow.runName - if (workflow.containerEngine) { - workflow_summary['containerEngine'] = workflow.containerEngine - } - if (workflow.container) { - workflow_summary['container'] = workflow.container - } - workflow_summary['launchDir'] = workflow.launchDir - workflow_summary['workDir'] = workflow.workDir - workflow_summary['projectDir'] = workflow.projectDir - workflow_summary['userName'] = workflow.userName - workflow_summary['profile'] = workflow.profile - workflow_summary['configFiles'] = workflow.configFiles.join(', ') - - // Get pipeline parameters defined in JSON Schema - def Map params_summary = [:] - def params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename)) - for (group in params_map.keySet()) { - def sub_params = new LinkedHashMap() - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (params.containsKey(param)) { - def params_value = params.get(param) - def schema_value = group_params.get(param).default - def param_type = group_params.get(param).type - if (schema_value != null) { - if (param_type == 'string') { - if (schema_value.contains('$projectDir') || schema_value.contains('${projectDir}')) { - def sub_string = schema_value.replace('\$projectDir', '') - sub_string = sub_string.replace('\${projectDir}', '') - if (params_value.contains(sub_string)) { - schema_value = params_value - } - } - if (schema_value.contains('$params.outdir') || schema_value.contains('${params.outdir}')) { - def sub_string = schema_value.replace('\$params.outdir', '') - sub_string = sub_string.replace('\${params.outdir}', '') - if ("${params.outdir}${sub_string}" == params_value) { - schema_value = params_value - } - } - } - } - - // We have a default in the schema, and this isn't it - if (schema_value != null && params_value != schema_value) { - sub_params.put(param, params_value) - } - // No default in the schema, and this isn't empty - else if (schema_value == null && 
params_value != "" && params_value != null && params_value != false) { - sub_params.put(param, params_value) - } - } - } - params_summary.put(group, sub_params) - } - return [ 'Core Nextflow options' : workflow_summary ] << params_summary - } - - // - // Beautify parameters for summary and return as string - // - public static String paramsSummaryLog(workflow, params) { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - String output = '' - def params_map = paramsSummaryMap(workflow, params) - def max_chars = paramsMaxChars(params_map) - for (group in params_map.keySet()) { - def group_params = params_map.get(group) // This gets the parameters of that particular group - if (group_params) { - output += colors.bold + group + colors.reset + '\n' - for (param in group_params.keySet()) { - output += " " + colors.blue + param.padRight(max_chars) + ": " + colors.green + group_params.get(param) + colors.reset + '\n' - } - output += '\n' - } - } - output += "!! Only displaying parameters that differ from the pipeline defaults !!\n" - output += NfcoreTemplate.dashedLine(params.monochrome_logs) - return output - } - - // - // Loop over nested exceptions and print the causingException - // - private static void printExceptions(ex_json, params_json, log, enums, limit=5) { - def causingExceptions = ex_json['causingExceptions'] - if (causingExceptions.length() == 0) { - def m = ex_json['message'] =~ /required key \[([^\]]+)\] not found/ - // Missing required param - if (m.matches()) { - log.error "* Missing required parameter: --${m[0][1]}" - } - // Other base-level error - else if (ex_json['pointerToViolation'] == '#') { - log.error "* ${ex_json['message']}" - } - // Error with specific param - else { - def param = ex_json['pointerToViolation'] - ~/^#\// - def param_val = params_json[param].toString() - if (enums.containsKey(param)) { - def error_msg = "* --${param}: '${param_val}' is not a valid choice (Available choices" - if (enums[param].size() > limit) { - log.error "${error_msg} (${limit} of ${enums[param].size()}): ${enums[param][0..limit-1].join(', ')}, ... 
)" - } else { - log.error "${error_msg}: ${enums[param].join(', ')})" - } - } else { - log.error "* --${param}: ${ex_json['message']} (${param_val})" - } - } - } - for (ex in causingExceptions) { - printExceptions(ex, params_json, log, enums) - } - } - - // - // Remove an element from a JSONArray - // - private static JSONArray removeElement(json_array, element) { - def list = [] - int len = json_array.length() - for (int i=0;i - if(raw_schema.keySet().contains('definitions')){ - raw_schema.definitions.each { definition -> - for (key in definition.keySet()){ - if (definition[key].get("properties").keySet().contains(ignore_param)){ - // Remove the param to ignore - definition[key].get("properties").remove(ignore_param) - // If the param was required, change this - if (definition[key].has("required")) { - def cleaned_required = removeElement(definition[key].required, ignore_param) - definition[key].put("required", cleaned_required) - } - } - } - } - } - if(raw_schema.keySet().contains('properties') && raw_schema.get('properties').keySet().contains(ignore_param)) { - raw_schema.get("properties").remove(ignore_param) - } - if(raw_schema.keySet().contains('required') && raw_schema.required.contains(ignore_param)) { - def cleaned_required = removeElement(raw_schema.required, ignore_param) - raw_schema.put("required", cleaned_required) - } - } - return raw_schema - } - - // - // Clean and check parameters relative to Nextflow native classes - // - private static Map cleanParameters(params) { - def new_params = params.getClass().newInstance(params) - for (p in params) { - // remove anything evaluating to false - if (!p['value']) { - new_params.remove(p.key) - } - // Cast MemoryUnit to String - if (p['value'].getClass() == nextflow.util.MemoryUnit) { - new_params.replace(p.key, p['value'].toString()) - } - // Cast Duration to String - if (p['value'].getClass() == nextflow.util.Duration) { - new_params.replace(p.key, p['value'].toString().replaceFirst(/d(?!\S)/, "day")) - } - // Cast LinkedHashMap to String - if (p['value'].getClass() == LinkedHashMap) { - new_params.replace(p.key, p['value'].toString()) - } - } - return new_params - } - - // - // This function tries to read a JSON params file - // - private static LinkedHashMap paramsLoad(String json_schema) { - def params_map = new LinkedHashMap() - try { - params_map = paramsRead(json_schema) - } catch (Exception e) { - println "Could not read parameters settings from JSON. $e" - params_map = new LinkedHashMap() - } - return params_map - } - - // - // Method to actually read in JSON file using Groovy. - // Group (as Key), values are all parameters - // - Parameter1 as Key, Description as Value - // - Parameter2 as Key, Description as Value - // .... 
- // Group - // - - private static LinkedHashMap paramsRead(String json_schema) throws Exception { - def json = new File(json_schema).text - def Map schema_definitions = (Map) new JsonSlurper().parseText(json).get('definitions') - def Map schema_properties = (Map) new JsonSlurper().parseText(json).get('properties') - /* Tree looks like this in nf-core schema - * definitions <- this is what the first get('definitions') gets us - group 1 - title - description - properties - parameter 1 - type - description - parameter 2 - type - description - group 2 - title - description - properties - parameter 1 - type - description - * properties <- parameters can also be ungrouped, outside of definitions - parameter 1 - type - description - */ - - // Grouped params - def params_map = new LinkedHashMap() - schema_definitions.each { key, val -> - def Map group = schema_definitions."$key".properties // Gets the property object of the group - def title = schema_definitions."$key".title - def sub_params = new LinkedHashMap() - group.each { innerkey, value -> - sub_params.put(innerkey, value) - } - params_map.put(title, sub_params) - } - - // Ungrouped params - def ungrouped_params = new LinkedHashMap() - schema_properties.each { innerkey, value -> - ungrouped_params.put(innerkey, value) - } - params_map.put("Other parameters", ungrouped_params) - - return params_map - } - - // - // Get maximum number of characters across all parameter names - // - private static Integer paramsMaxChars(params_map) { - Integer max_chars = 0 - for (group in params_map.keySet()) { - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (param.size() > max_chars) { - max_chars = param.size() - } - } - } - return max_chars - } -} diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index 25a0a74a..e248e4c3 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -3,6 +3,8 @@ // import org.yaml.snakeyaml.Yaml +import groovy.json.JsonOutput +import nextflow.extension.FilesEx class NfcoreTemplate { @@ -128,7 +130,7 @@ class NfcoreTemplate { def email_html = html_template.toString() // Render the sendmail template - def max_multiqc_email_size = params.max_multiqc_email_size as nextflow.util.MemoryUnit + def max_multiqc_email_size = (params.containsKey('max_multiqc_email_size') ? 
params.max_multiqc_email_size : 0) as nextflow.util.MemoryUnit def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes() ] def sf = new File("$projectDir/assets/sendmail_template.txt") def sendmail_template = engine.createTemplate(sf).make(smail_fields) @@ -140,12 +142,14 @@ class NfcoreTemplate { try { if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } // Try to send HTML e-mail using sendmail + def sendmail_tf = new File(workflow.launchDir.toString(), ".sendmail_tmp.html") + sendmail_tf.withWriter { w -> w << sendmail_html } [ 'sendmail', '-t' ].execute() << sendmail_html log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (sendmail)-" } catch (all) { // Catch failures and try with plaintext def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] - if ( mqc_report.size() <= max_multiqc_email_size.toBytes() ) { + if ( mqc_report != null && mqc_report.size() <= max_multiqc_email_size.toBytes() ) { mail_cmd += [ '-A', mqc_report ] } mail_cmd.execute() << email_html @@ -154,14 +158,16 @@ class NfcoreTemplate { } // Write summary e-mail HTML to a file - def output_d = new File("${params.outdir}/pipeline_info/") - if (!output_d.exists()) { - output_d.mkdirs() - } - def output_hf = new File(output_d, "pipeline_report.html") + def output_hf = new File(workflow.launchDir.toString(), ".pipeline_report.html") output_hf.withWriter { w -> w << email_html } - def output_tf = new File(output_d, "pipeline_report.txt") + FilesEx.copyTo(output_hf.toPath(), "${params.outdir}/pipeline_info/pipeline_report.html"); + output_hf.delete() + + // Write summary e-mail TXT to a file + def output_tf = new File(workflow.launchDir.toString(), ".pipeline_report.txt") output_tf.withWriter { w -> w << email_txt } + FilesEx.copyTo(output_tf.toPath(), "${params.outdir}/pipeline_info/pipeline_report.txt"); + output_tf.delete() } // @@ -222,6 +228,20 @@ class NfcoreTemplate { } } + // + // Dump pipeline parameters in a json file + // + public static void dump_parameters(workflow, params) { + def timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') + def filename = "params_${timestamp}.json" + def temp_pf = new File(workflow.launchDir.toString(), ".${filename}") + def jsonStr = JsonOutput.toJson(params) + temp_pf.text = JsonOutput.prettyPrint(jsonStr) + + FilesEx.copyTo(temp_pf.toPath(), "${params.outdir}/pipeline_info/params_${timestamp}.json") + temp_pf.delete() + } + // // Print pipeline summary on completion // diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 7d26a0b0..a6d4fc59 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -19,40 +19,11 @@ class WorkflowMain { " https://github.com/${workflow.manifest.name}/blob/master/CITATIONS.md" } - // - // Generate help string - // - public static String help(workflow, params) { - def command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh37 -profile docker" - def help_string = '' - help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) - help_string += NfcoreSchema.paramsHelp(workflow, params, command) - help_string += '\n' + citation(workflow) + '\n' - help_string += NfcoreTemplate.dashedLine(params.monochrome_logs) - return help_string - } - - // - // Generate parameter summary log string - // - public static String 
paramsSummaryLog(workflow, params) { - def summary_log = '' - summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) - summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) - summary_log += '\n' + citation(workflow) + '\n' - summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) - return summary_log - } // // Validate parameters and print summary to screen // public static void initialise(workflow, params, log) { - // Print help to screen if required - if (params.help) { - log.info help(workflow, params) - System.exit(0) - } // Print workflow version and exit on --version if (params.version) { @@ -61,14 +32,6 @@ class WorkflowMain { System.exit(0) } - // Print parameter summary log to screen - log.info paramsSummaryLog(workflow, params) - - // Validate workflow parameters via the JSON schema - if (params.validate_params) { - NfcoreSchema.validateParameters(workflow, params, log) - } - // Check that a -profile or Nextflow config has been provided to run the pipeline NfcoreTemplate.checkConfigProvided(workflow, log) diff --git a/main.nf b/main.nf index 62a59150..42efcd9e 100644 --- a/main.nf +++ b/main.nf @@ -17,6 +17,8 @@ nextflow.enable.dsl = 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +// This is an example of how to use getGenomeAttribute() to fetch parameters +// from igenomes.config using `--genome` params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') /* @@ -25,6 +27,22 @@ params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +include { validateParameters; paramsHelp } from 'plugin/nf-validation' + +// Print help message if needed +if (params.help) { + def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) + def citation = '\n' + WorkflowMain.citation(workflow) + '\n' + def String command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh37 -profile docker" + log.info logo + paramsHelp(command) + citation + NfcoreTemplate.dashedLine(params.monochrome_logs) + System.exit(0) +} + +// Validate input parameters +if (params.validate_params) { + validateParameters() +} + WorkflowMain.initialise(workflow, params, log) /* diff --git a/modules.json b/modules.json index 6d809e90..0145244f 100644 --- a/modules.json +++ b/modules.json @@ -17,12 +17,12 @@ }, "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", "installed_by": ["modules"] }, "fastqc": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", "installed_by": ["modules"] }, "minimap2/align": { @@ -32,7 +32,7 @@ }, "multiqc": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "4ab13872435962dadc239979554d13709e20bf29", "installed_by": ["modules"] }, "picard/markduplicates": { diff --git a/modules/local/ampliconsuite/ampliconsuite.nf b/modules/local/ampliconsuite/ampliconsuite.nf new file mode 100644 index 00000000..882aecfb --- /dev/null +++ b/modules/local/ampliconsuite/ampliconsuite.nf @@ -0,0 +1,95 @@ +process AMPLICONSUITE { + tag "$meta.id" + label 'process_low' + + conda "bioconda::ampliconsuite=1.2.0 mosek::mosek=10.1.21" + container 'quay.io/nf-core/prepareaa:1.0.0' + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("*CNV_SEEDS.bed") , emit: 
bed + path "*.log" , emit: log + path "*run_metadata.json" , emit: run_metadata_json + path "*sample_metadata.json" , emit: sample_metadata_json + path "*timing_log.txt" , emit: timing_log + path "*cycles.txt" , emit: cycles, optional: true + path "*graph.txt" , emit: graph, optional: true + path "*edges.txt" , emit: edges, optional: true + path "*edges_cnseg.txt" , emit: edges_cnseg, optional: true + path "*.out" , emit: aa_out, optional: true + path "versions.yml" , emit: versions + path "*.png" , optional: true + path "*.pdf" , optional: true + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def cngain = params.aa_cngain + def ref = params.reference_build + """ + export AA_DATA_REPO=${params.aa_data_repo} + export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} + # Define Variables AA_SRC and AC_SRC + if ! command -v AmpliconArchitect.py &> /dev/null; then + export AA_SRC=\$(dirname \$(python -c "import ampliconarchitectlib; print(ampliconarchitectlib.__file__)")) + else + export AA_SRC=\$(dirname \$(readlink -f \$(which AmpliconArchitect.py))) + fi + + if ! command -v amplicon_classifier.py &> /dev/null; then + export AC_SRC=\$(dirname \$(python -c "import ampliconclassifierlib; print(ampliconclassifierlib.__file__)")) + else + export AC_SRC=\$(dirname \$(readlink -f \$(which amplicon_classifier.py))) + fi + REF=${params.reference_build} + + AmpliconSuite-pipeline.py \\ + $args \\ + -s $prefix \\ + -t $task.cpus \\ + --bam $bam \\ + --ref $ref \\ + --run_AA --run_AC \\ + $args + + # Move Files to base work directory + mv ${prefix}_cnvkit_output/* ./ + mv ${prefix}_AA_results/* ./ + mv ${prefix}_classification/* ./ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + AmpliconSuite-pipeline.py: \$(AmpliconSuite-pipeline.py --version | sed 's/AmpliconSuite-pipeline version //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def cngain = params.aa_cngain + def ref = params.reference_build + """ + export AA_DATA_REPO=${params.aa_data_repo} + export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} + REF=${params.reference_build} + + touch "${prefix}_CNV_SEEDS.bed" + touch "${prefix}.log" + touch "${prefix}.run_metadata.json" + touch "${prefix}.sample_metadata.json" + touch "${prefix}.timing_log.txt" + touch "${prefix}_summary.txt" + + PrepareAA.py --help + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + AmpliconSuite-pipeline.py: \$(AmpliconSuite-pipeline.py --version | sed 's/AmpliconSuite-pipeline version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/environment.yml b/modules/nf-core/custom/dumpsoftwareversions/environment.yml new file mode 100644 index 00000000..f0c63f69 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/environment.yml @@ -0,0 +1,7 @@ +name: custom_dumpsoftwareversions +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multiqc=1.17 diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml index c32657de..5f15a5fd 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/meta.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -1,4 +1,4 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +# yaml-language-server: 
$schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: custom_dumpsoftwareversions description: Custom module used to dump software versions within the nf-core pipeline template keywords: @@ -16,7 +16,6 @@ input: type: file description: YML file containing software versions pattern: "*.yml" - output: - yml: type: file @@ -30,7 +29,9 @@ output: type: file description: File containing software versions pattern: "versions.yml" - authors: - "@drpatelh" - "@grst" +maintainers: + - "@drpatelh" + - "@grst" diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test new file mode 100644 index 00000000..eec1db10 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test @@ -0,0 +1,38 @@ +nextflow_process { + + name "Test Process CUSTOM_DUMPSOFTWAREVERSIONS" + script "../main.nf" + process "CUSTOM_DUMPSOFTWAREVERSIONS" + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "dumpsoftwareversions" + tag "custom/dumpsoftwareversions" + + test("Should run without failures") { + when { + process { + """ + def tool1_version = ''' + TOOL1: + tool1: 0.11.9 + '''.stripIndent() + + def tool2_version = ''' + TOOL2: + tool2: 1.9 + '''.stripIndent() + + input[0] = Channel.of(tool1_version, tool2_version).collectFile() + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap new file mode 100644 index 00000000..4274ed57 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap @@ -0,0 +1,27 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + "software_versions.yml:md5,1c851188476409cda5752ce971b20b58" + ], + "1": [ + "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d" + ], + "2": [ + "versions.yml:md5,3843ac526e762117eedf8825b40683df" + ], + "mqc_yml": [ + "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d" + ], + "versions": [ + "versions.yml:md5,3843ac526e762117eedf8825b40683df" + ], + "yml": [ + "software_versions.yml:md5,1c851188476409cda5752ce971b20b58" + ] + } + ], + "timestamp": "2023-11-03T14:43:22.157011" + } +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml new file mode 100644 index 00000000..405aa24a --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml @@ -0,0 +1,2 @@ +custom/dumpsoftwareversions: + - modules/nf-core/custom/dumpsoftwareversions/** diff --git a/modules/nf-core/fastqc/environment.yml b/modules/nf-core/fastqc/environment.yml new file mode 100644 index 00000000..1787b38a --- /dev/null +++ b/modules/nf-core/fastqc/environment.yml @@ -0,0 +1,7 @@ +name: fastqc +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fastqc=0.12.1 diff --git a/modules/nf-core/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml index 4da5bb5a..ee5507e0 100644 --- a/modules/nf-core/fastqc/meta.yml +++ b/modules/nf-core/fastqc/meta.yml @@ -50,3 +50,8 @@ authors: - "@grst" - "@ewels" - "@FelixKrueger" +maintainers: + - "@drpatelh" + - "@grst" + - "@ewels" + - "@FelixKrueger" diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test new file mode 100644 index 00000000..b9e8f926 --- /dev/null +++ 
b/modules/nf-core/fastqc/tests/main.nf.test @@ -0,0 +1,109 @@ +nextflow_process { + + name "Test Process FASTQC" + script "../main.nf" + process "FASTQC" + tag "modules" + tag "modules_nfcore" + tag "fastqc" + + test("Single-Read") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id: 'test', single_end:true ], + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it. + // looks like this:
<div id="header_filename">Mon 2 Oct 2023<br/>test.gz</div>
+ // https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039 + { assert process.out.html.get(0).get(1) ==~ ".*/test_fastqc.html" }, + { assert path(process.out.html.get(0).get(1)).getText().contains("File typeConventional base calls") }, + { assert snapshot(process.out.versions).match("versions") }, + { assert process.out.zip.get(0).get(1) ==~ ".*/test_fastqc.zip" } + ) + } + } +// TODO +// // +// // Test with paired-end data +// // +// workflow test_fastqc_paired_end { +// input = [ +// [id: 'test', single_end: false], // meta map +// [ +// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), +// file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) +// ] +// ] + +// FASTQC ( input ) +// } + +// // +// // Test with interleaved data +// // +// workflow test_fastqc_interleaved { +// input = [ +// [id: 'test', single_end: false], // meta map +// file(params.test_data['sarscov2']['illumina']['test_interleaved_fastq_gz'], checkIfExists: true) +// ] + +// FASTQC ( input ) +// } + +// // +// // Test with bam data +// // +// workflow test_fastqc_bam { +// input = [ +// [id: 'test', single_end: false], // meta map +// file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) +// ] + +// FASTQC ( input ) +// } + +// // +// // Test with multiple samples +// // +// workflow test_fastqc_multiple { +// input = [ +// [id: 'test', single_end: false], // meta map +// [ +// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), +// file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), +// file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true), +// file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true) +// ] +// ] + +// FASTQC ( input ) +// } + +// // +// // Test with custom prefix +// // +// workflow test_fastqc_custom_prefix { +// input = [ +// [ id:'mysample', single_end:true ], // meta map +// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) +// ] + +// FASTQC ( input ) +// } +} diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap new file mode 100644 index 00000000..636a32ce --- /dev/null +++ b/modules/nf-core/fastqc/tests/main.nf.test.snap @@ -0,0 +1,10 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "timestamp": "2023-10-09T23:40:54+0000" + } +} \ No newline at end of file diff --git a/modules/nf-core/fastqc/tests/tags.yml b/modules/nf-core/fastqc/tests/tags.yml new file mode 100644 index 00000000..7834294b --- /dev/null +++ b/modules/nf-core/fastqc/tests/tags.yml @@ -0,0 +1,2 @@ +fastqc: + - modules/nf-core/fastqc/** diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml new file mode 100644 index 00000000..bc0bdb5b --- /dev/null +++ b/modules/nf-core/multiqc/environment.yml @@ -0,0 +1,7 @@ +name: multiqc +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multiqc=1.18 diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index f93b5ee5..f1aa660e 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -1,5 +1,5 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json -name: MultiQC +# yaml-language-server: 
$schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: multiqc description: Aggregate results from bioinformatics analyses across many samples into a single report keywords: - QC @@ -13,7 +13,6 @@ tools: homepage: https://multiqc.info/ documentation: https://multiqc.info/docs/ licence: ["GPL-3.0-or-later"] - input: - multiqc_files: type: file @@ -31,7 +30,6 @@ input: type: file description: Optional logo file for MultiQC pattern: "*.{png}" - output: - report: type: file @@ -54,3 +52,8 @@ authors: - "@bunop" - "@drpatelh" - "@jfy133" +maintainers: + - "@abhi18av" + - "@bunop" + - "@drpatelh" + - "@jfy133" diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test new file mode 100644 index 00000000..c2dad217 --- /dev/null +++ b/modules/nf-core/multiqc/tests/main.nf.test @@ -0,0 +1,63 @@ +nextflow_process { + + name "Test Process MULTIQC" + script "../main.nf" + process "MULTIQC" + tag "modules" + tag "modules_nfcore" + tag "multiqc" + + test("MULTIQC: FASTQC") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) + input[1] = [] + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.report.get(0)).exists() }, + { assert path(process.out.data.get(0)).exists() }, + { assert path(process.out.versions.get(0)).getText().contains("multiqc") } + ) + } + + } + + test("MULTIQC: FASTQC and a config file") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) + input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.report.get(0)).exists() }, + { assert path(process.out.data.get(0)).exists() }, + { assert path(process.out.versions.get(0)).getText().contains("multiqc") } + ) + } + + } +} diff --git a/modules/nf-core/multiqc/tests/tags.yml b/modules/nf-core/multiqc/tests/tags.yml new file mode 100644 index 00000000..bea6c0d3 --- /dev/null +++ b/modules/nf-core/multiqc/tests/tags.yml @@ -0,0 +1,2 @@ +multiqc: + - modules/nf-core/multiqc/** diff --git a/nextflow.config b/nextflow.config index 8b0f1a2e..46a78943 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,7 +15,7 @@ params { // References genome = null - igenomes_base = 's3://ngi-igenomes/igenomes' + igenomes_base = 's3://ngi-igenomes/igenomes/' igenomes_ignore = false // BWA Reference @@ -69,7 +69,6 @@ params { // Boilerplate options outdir = null - tracedir = "${params.outdir}/pipeline_info" publish_dir_mode = 'copy' email = null email_on_fail = null @@ -78,17 +77,14 @@ params { hook_url = null help = false version = false - validate_params = true - show_hidden_params = false - schema_ignore_params = 'genomes' // Config options + config_profile_name = null + config_profile_description = null custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" - config_profile_description = null config_profile_contact = null config_profile_url = null - config_profile_name = null // Max resource options @@ -96,6 +92,14 @@ params { 
max_memory = '128.GB' max_cpus = 16 max_time = '240.h' + + // Schema validation default options + validationFailUnrecognisedParams = false + validationLenientMode = false + validationSchemaIgnoreParams = 'genomes,igenomes_base' + validationShowHiddenParams = false + validate_params = true + } // Load base.config by default for all pipelines @@ -115,13 +119,12 @@ try { // } catch (Exception e) { // System.err.println("WARNING: Could not load nf-core/config/circdna profiles: ${params.custom_config_base}/pipeline/circdna.config") // } - - profiles { debug { dumpHashes = true process.beforeScript = 'echo $HOSTNAME' - cleanup = false + cleanup = false + nextflow.enable.configProcessNamesValidation = true } conda { conda.enabled = true @@ -144,14 +147,13 @@ profiles { } docker { docker.enabled = true - docker.registry = 'quay.io' - docker.userEmulation = true conda.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false apptainer.enabled = false + docker.runOptions = '-u $(id -u):$(id -g)' } arm { docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' @@ -168,7 +170,6 @@ profiles { } podman { podman.enabled = true - podman.registry = 'quay.io' conda.enabled = false docker.enabled = false singularity.enabled = false @@ -196,6 +197,7 @@ profiles { } apptainer { apptainer.enabled = true + apptainer.autoMounts = true conda.enabled = false docker.enabled = false singularity.enabled = false @@ -205,14 +207,26 @@ profiles { } gitpod { executor.name = 'local' - executor.cpus = 16 - executor.memory = 60.GB + executor.cpus = 4 + executor.memory = 8.GB } test { includeConfig 'conf/test.config' } test_AA { includeConfig 'conf/test_AA.config' } test_full { includeConfig 'conf/test_full.config' } } +// Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile +// Will not be used unless Apptainer / Docker / Podman / Singularity are enabled +// Set to your registry if you have a mirror of containers +apptainer.registry = 'quay.io' +docker.registry = 'quay.io' +podman.registry = 'quay.io' +singularity.registry = 'quay.io' + +// Nextflow plugins +plugins { + id 'nf-validation@1.1.3' // Validation of pipeline parameters and creation of an input channel from a sample sheet +} // Load igenomes.config if required if (!params.igenomes_ignore) { @@ -220,8 +234,6 @@ if (!params.igenomes_ignore) { } else { params.genomes = [:] } - - // Export these variables to prevent local Python/R libraries from conflicting with those in the container // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. @@ -236,22 +248,26 @@ env { // Capture exit codes from upstream processes when piping process.shell = ['/bin/bash', '-euo', 'pipefail'] + +// Disable process selector warnings by default. Use debug profile to enable warnings. 
+nextflow.enable.configProcessNamesValidation = false def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') + timeline { enabled = true - file = "${params.tracedir}/execution_timeline_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/execution_timeline_${trace_timestamp}.html" } report { enabled = true - file = "${params.tracedir}/execution_report_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/execution_report_${trace_timestamp}.html" } trace { enabled = true - file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" + file = "${params.outdir}/pipeline_info/execution_trace_${trace_timestamp}.txt" } dag { enabled = true - file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/pipeline_dag_${trace_timestamp}.html" } // wave { // enabled = true @@ -264,8 +280,8 @@ manifest { homePage = 'https://github.com/nf-core/circdna' description = """Pipeline for the identification of circular DNAs""" mainScript = 'main.nf' - nextflowVersion = '!>=22.10.1' - version = '1.0.5dev' + nextflowVersion = '!>=23.04.0' + version = '1.0.5dev' doi = '10.5281/zenodo.7712010' } diff --git a/nextflow_schema.json b/nextflow_schema.json index ff153988..55a03ca8 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -15,9 +15,9 @@ "input": { "type": "string", "format": "file-path", + "exists": true, "mimetype": "text/csv", "pattern": "^\\S+\\.csv$", - "schema": "assets/schema_input.json", "description": "Path to comma-separated file containing information about the samples in the experiment.", "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with either 2 [BAM] or 3 [FASTQ] columns, and a header row. See [usage docs](https://nf-co.re/circdna/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" @@ -92,20 +92,13 @@ "fasta": { "type": "string", "format": "file-path", + "exists": true, "mimetype": "text/plain", "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", "description": "Path to FASTA genome file.", "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", "fa_icon": "far fa-file-code" }, - "igenomes_base": { - "type": "string", - "format": "directory-path", - "description": "Directory / URL base for iGenomes references.", - "default": "s3://ngi-igenomes/igenomes", - "fa_icon": "fas fa-cloud-download-alt", - "hidden": true - }, "igenomes_ignore": { "type": "boolean", "description": "Do not load the iGenomes reference config.", @@ -399,7 +392,7 @@ "description": "Maximum amount of time that can be requested for any single job.", "default": "240.h", "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", + "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", "hidden": true, "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. 
`--max_time '2.h'`" } @@ -470,6 +463,7 @@ }, "multiqc_config": { "type": "string", + "format": "file-path", "description": "Custom config file to supply to MultiQC.", "fa_icon": "fas fa-cog", "hidden": true @@ -485,13 +479,6 @@ "description": "Custom MultiQC yaml file containing HTML including a methods description.", "fa_icon": "fas fa-cog" }, - "tracedir": { - "type": "string", - "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/pipeline_info", - "fa_icon": "fas fa-cogs", - "hidden": true - }, "validate_params": { "type": "boolean", "description": "Boolean whether to validate parameters against the schema at runtime", @@ -499,12 +486,26 @@ "fa_icon": "fas fa-check-square", "hidden": true }, - "show_hidden_params": { + "validationShowHiddenParams": { "type": "boolean", "fa_icon": "far fa-eye-slash", "description": "Show all params when using `--help`", "hidden": true, "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." + }, + "validationFailUnrecognisedParams": { + "type": "boolean", + "fa_icon": "far fa-check-circle", + "description": "Validation of parameters fails when an unrecognised parameter is found.", + "hidden": true, + "help_text": "By default, when an unrecognised parameter is found, it returns a warinig." + }, + "validationLenientMode": { + "type": "boolean", + "fa_icon": "far fa-check-circle", + "description": "Validation of parameters in lenient more.", + "hidden": true, + "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." } } } diff --git a/workflows/circdna.nf b/workflows/circdna.nf index 11a7d0cc..5f2f4e8f 100644 --- a/workflows/circdna.nf +++ b/workflows/circdna.nf @@ -1,21 +1,19 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - VALIDATE INPUTS + PRINT PARAMS SUMMARY ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' -def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) +def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) +def citation = '\n' + WorkflowMain.citation(workflow) + '\n' +def summary_params = paramsSummaryMap(workflow) -// Validate input parameters +// Print parameter summary log to screen +log.info logo + paramsSummaryLog(workflow) + citation WorkflowCircdna.initialise(params, log) -// Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.fasta ] -for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } - -// Check mandatory parameters -if (params.input) { ch_input = Channel.fromPath(params.input) } else { exit 1, 'Input samplesheet not specified!' } if (params.fasta) { ch_fasta = Channel.fromPath(params.fasta) } else { exit 1, 'Fasta reference genome not specified!' } if (!(params.input_format == "FASTQ" | params.input_format == "BAM")) { @@ -41,7 +39,7 @@ if (run_unicycler && !params.input_format == "FASTQ") { exit 1, 'Unicycler needs FastQ input. Please specify input_format == "FASTQ", if possible, or don`t run unicycler.' } -if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' 
} +if (!params.input) { exit 1, 'Input samplesheet not specified!' } // Check if BWA Index is given if (params.bwa_index) { @@ -189,7 +187,7 @@ workflow CIRCDNA { // SUBWORKFLOW: Read in samplesheet, validate and stage input files // INPUT_CHECK ( - ch_input + file(params.input) ) .reads .map { @@ -287,7 +285,7 @@ workflow CIRCDNA { } else if (params.input_format == "BAM") { // Use BAM Files as input INPUT_CHECK ( - ch_input + file(params.input) ) if (!params.bam_sorted){ SAMTOOLS_SORT_BAM ( @@ -535,9 +533,16 @@ workflow CIRCDNA { // MODULE: MultiQC // if (!params.skip_multiqc) { - workflow_summary = WorkflowCircdna.paramsSummaryMultiqc(workflow, summary_params) + workflow_summary = WorkflowCircdna.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) + methods_description = WorkflowCircdna.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) + ch_methods_description = Channel.value(methods_description) + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) + MULTIQC ( ch_multiqc_config, ch_multiqc_custom_config.collect().ifEmpty([]), @@ -569,6 +574,7 @@ workflow.onComplete { if (params.email || params.email_on_fail) { NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report) } + NfcoreTemplate.dump_parameters(workflow, params) NfcoreTemplate.summary(workflow, params, log) if (params.hook_url) { NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) From 4619740e6b4c84fd205ae25029ee9bf00e478db1 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Sat, 6 Jan 2024 10:12:50 +0000 Subject: [PATCH 24/48] modules and subworkflows update to 2.11.2 --- modules.json | 40 +-- modules/nf-core/bwa/index/environment.yml | 7 + modules/nf-core/bwa/index/main.nf | 18 +- modules/nf-core/bwa/index/meta.yml | 3 + modules/nf-core/bwa/index/tests/main.nf.test | 33 +++ .../nf-core/bwa/index/tests/main.nf.test.snap | 43 +++ modules/nf-core/bwa/index/tests/tags.yml | 2 + modules/nf-core/cat/fastq/environment.yml | 7 + modules/nf-core/cat/fastq/main.nf | 2 +- modules/nf-core/cat/fastq/meta.yml | 4 +- modules/nf-core/cat/fastq/tests/main.nf.test | 143 +++++++++ .../nf-core/cat/fastq/tests/main.nf.test.snap | 78 +++++ modules/nf-core/cat/fastq/tests/tags.yml | 2 + .../custom/dumpsoftwareversions/main.nf | 6 +- .../dumpsoftwareversions/tests/main.nf.test | 7 +- .../tests/main.nf.test.snap | 50 ++-- modules/nf-core/fastqc/main.nf | 16 +- modules/nf-core/fastqc/tests/main.nf.test | 271 ++++++++++++------ .../nf-core/fastqc/tests/main.nf.test.snap | 12 +- .../nf-core/minimap2/align/environment.yml | 8 + modules/nf-core/minimap2/align/main.nf | 8 +- modules/nf-core/minimap2/align/meta.yml | 10 + .../nf-core/minimap2/align/tests/main.nf.test | 145 ++++++++++ .../minimap2/align/tests/main.nf.test.snap | 38 +++ modules/nf-core/minimap2/align/tests/tags.yml | 2 + modules/nf-core/multiqc/main.nf | 10 +- modules/nf-core/multiqc/meta.yml | 1 - modules/nf-core/multiqc/tests/main.nf.test | 48 +++- .../nf-core/multiqc/tests/main.nf.test.snap | 21 ++ .../picard/markduplicates/environment.yml | 7 + modules/nf-core/picard/markduplicates/main.nf | 10 +- .../nf-core/picard/markduplicates/meta.yml | 4 + 
.../picard/markduplicates/tests/main.nf.test | 111 +++++++ .../markduplicates/tests/main.nf.test.snap | 44 +++ .../markduplicates/tests/nextflow.config | 6 + .../picard/markduplicates/tests/tags.yml | 2 + .../nf-core/samtools/faidx/environment.yml | 7 + modules/nf-core/samtools/faidx/main.nf | 10 +- modules/nf-core/samtools/faidx/meta.yml | 4 + .../nf-core/samtools/flagstat/environment.yml | 7 + modules/nf-core/samtools/flagstat/main.nf | 17 +- modules/nf-core/samtools/flagstat/meta.yml | 2 + .../samtools/flagstat/tests/main.nf.test | 36 +++ .../samtools/flagstat/tests/main.nf.test.snap | 16 ++ .../nf-core/samtools/flagstat/tests/tags.yml | 2 + .../nf-core/samtools/idxstats/environment.yml | 7 + modules/nf-core/samtools/idxstats/main.nf | 18 +- modules/nf-core/samtools/idxstats/meta.yml | 2 + .../samtools/idxstats/tests/main.nf.test | 36 +++ .../samtools/idxstats/tests/main.nf.test.snap | 16 ++ .../nf-core/samtools/idxstats/tests/tags.yml | 2 + .../nf-core/samtools/index/environment.yml | 7 + modules/nf-core/samtools/index/main.nf | 6 +- modules/nf-core/samtools/index/meta.yml | 4 + .../samtools/index/tests/csi.nextflow.config | 7 + .../nf-core/samtools/index/tests/main.nf.test | 87 ++++++ .../samtools/index/tests/main.nf.test.snap | 28 ++ modules/nf-core/samtools/index/tests/tags.yml | 2 + modules/nf-core/samtools/sort/environment.yml | 7 + modules/nf-core/samtools/sort/main.nf | 8 +- modules/nf-core/samtools/sort/meta.yml | 3 + .../nf-core/samtools/sort/tests/main.nf.test | 73 +++++ .../samtools/sort/tests/main.nf.test.snap | 48 ++++ .../samtools/sort/tests/nextflow.config | 7 + modules/nf-core/samtools/sort/tests/tags.yml | 3 + .../nf-core/samtools/stats/environment.yml | 7 + modules/nf-core/samtools/stats/main.nf | 6 +- modules/nf-core/samtools/stats/meta.yml | 4 + .../nf-core/samtools/stats/tests/main.nf.test | 78 +++++ .../samtools/stats/tests/main.nf.test.snap | 64 +++++ modules/nf-core/samtools/stats/tests/tags.yml | 2 + modules/nf-core/samtools/view/environment.yml | 7 + modules/nf-core/samtools/view/main.nf | 19 +- modules/nf-core/samtools/view/meta.yml | 5 + .../nf-core/samtools/view/tests/bam.config | 3 + .../samtools/view/tests/bam_index.config | 3 + .../nf-core/samtools/view/tests/main.nf.test | 231 +++++++++++++++ .../samtools/view/tests/main.nf.test.snap | 140 +++++++++ modules/nf-core/samtools/view/tests/tags.yml | 2 + modules/nf-core/trimgalore/environment.yml | 7 + modules/nf-core/trimgalore/main.nf | 2 +- modules/nf-core/trimgalore/meta.yml | 4 + modules/nf-core/trimgalore/tests/main.nf.test | 105 +++++++ .../trimgalore/tests/main.nf.test.snap | 148 ++++++++++ modules/nf-core/trimgalore/tests/tags.yml | 2 + .../bam_markduplicates_picard/meta.yml | 8 +- .../tests/main.nf.test | 92 ++++++ .../tests/main.nf.test.snap | 22 ++ .../bam_markduplicates_picard/tests/tags.yml | 2 + .../nf-core/bam_stats_samtools/meta.yml | 4 +- .../bam_stats_samtools/tests/main.nf.test | 102 +++++++ .../tests/main.nf.test.snap | 128 +++++++++ .../nf-core/bam_stats_samtools/tests/tags.yml | 2 + 93 files changed, 2691 insertions(+), 199 deletions(-) create mode 100644 modules/nf-core/bwa/index/environment.yml create mode 100644 modules/nf-core/bwa/index/tests/main.nf.test create mode 100644 modules/nf-core/bwa/index/tests/main.nf.test.snap create mode 100644 modules/nf-core/bwa/index/tests/tags.yml create mode 100644 modules/nf-core/cat/fastq/environment.yml create mode 100644 modules/nf-core/cat/fastq/tests/main.nf.test create mode 100644 modules/nf-core/cat/fastq/tests/main.nf.test.snap create 
mode 100644 modules/nf-core/cat/fastq/tests/tags.yml create mode 100644 modules/nf-core/minimap2/align/environment.yml create mode 100644 modules/nf-core/minimap2/align/tests/main.nf.test create mode 100644 modules/nf-core/minimap2/align/tests/main.nf.test.snap create mode 100644 modules/nf-core/minimap2/align/tests/tags.yml create mode 100644 modules/nf-core/multiqc/tests/main.nf.test.snap create mode 100644 modules/nf-core/picard/markduplicates/environment.yml create mode 100644 modules/nf-core/picard/markduplicates/tests/main.nf.test create mode 100644 modules/nf-core/picard/markduplicates/tests/main.nf.test.snap create mode 100644 modules/nf-core/picard/markduplicates/tests/nextflow.config create mode 100644 modules/nf-core/picard/markduplicates/tests/tags.yml create mode 100644 modules/nf-core/samtools/faidx/environment.yml create mode 100644 modules/nf-core/samtools/flagstat/environment.yml create mode 100644 modules/nf-core/samtools/flagstat/tests/main.nf.test create mode 100644 modules/nf-core/samtools/flagstat/tests/main.nf.test.snap create mode 100644 modules/nf-core/samtools/flagstat/tests/tags.yml create mode 100644 modules/nf-core/samtools/idxstats/environment.yml create mode 100644 modules/nf-core/samtools/idxstats/tests/main.nf.test create mode 100644 modules/nf-core/samtools/idxstats/tests/main.nf.test.snap create mode 100644 modules/nf-core/samtools/idxstats/tests/tags.yml create mode 100644 modules/nf-core/samtools/index/environment.yml create mode 100644 modules/nf-core/samtools/index/tests/csi.nextflow.config create mode 100644 modules/nf-core/samtools/index/tests/main.nf.test create mode 100644 modules/nf-core/samtools/index/tests/main.nf.test.snap create mode 100644 modules/nf-core/samtools/index/tests/tags.yml create mode 100644 modules/nf-core/samtools/sort/environment.yml create mode 100644 modules/nf-core/samtools/sort/tests/main.nf.test create mode 100644 modules/nf-core/samtools/sort/tests/main.nf.test.snap create mode 100644 modules/nf-core/samtools/sort/tests/nextflow.config create mode 100644 modules/nf-core/samtools/sort/tests/tags.yml create mode 100644 modules/nf-core/samtools/stats/environment.yml create mode 100644 modules/nf-core/samtools/stats/tests/main.nf.test create mode 100644 modules/nf-core/samtools/stats/tests/main.nf.test.snap create mode 100644 modules/nf-core/samtools/stats/tests/tags.yml create mode 100644 modules/nf-core/samtools/view/environment.yml create mode 100644 modules/nf-core/samtools/view/tests/bam.config create mode 100644 modules/nf-core/samtools/view/tests/bam_index.config create mode 100644 modules/nf-core/samtools/view/tests/main.nf.test create mode 100644 modules/nf-core/samtools/view/tests/main.nf.test.snap create mode 100644 modules/nf-core/samtools/view/tests/tags.yml create mode 100644 modules/nf-core/trimgalore/environment.yml create mode 100644 modules/nf-core/trimgalore/tests/main.nf.test create mode 100644 modules/nf-core/trimgalore/tests/main.nf.test.snap create mode 100644 modules/nf-core/trimgalore/tests/tags.yml create mode 100644 subworkflows/nf-core/bam_markduplicates_picard/tests/main.nf.test create mode 100644 subworkflows/nf-core/bam_markduplicates_picard/tests/main.nf.test.snap create mode 100644 subworkflows/nf-core/bam_markduplicates_picard/tests/tags.yml create mode 100644 subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test create mode 100644 subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test.snap create mode 100644 subworkflows/nf-core/bam_stats_samtools/tests/tags.yml diff --git 
a/modules.json b/modules.json index 0145244f..8f3be88a 100644 --- a/modules.json +++ b/modules.json @@ -7,77 +7,77 @@ "nf-core": { "bwa/index": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "cat/fastq": { "branch": "master", - "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", + "git_sha": "37dee863936732fe7e05dc598bf6e183a8e7ef73", "installed_by": ["modules"] }, "fastqc": { "branch": "master", - "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", + "git_sha": "617777a807a1770f73deb38c80004bac06807eef", "installed_by": ["modules"] }, "minimap2/align": { "branch": "master", - "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", "installed_by": ["modules"] }, "multiqc": { "branch": "master", - "git_sha": "4ab13872435962dadc239979554d13709e20bf29", + "git_sha": "642a0d8afe373ac45244a7947fb8a6c0a5a312d4", "installed_by": ["modules"] }, "picard/markduplicates": { "branch": "master", - "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "git_sha": "20b0918591d4ba20047d7e13e5094bcceba81447", "installed_by": ["bam_markduplicates_picard", "modules"] }, "samtools/faidx": { "branch": "master", - "git_sha": "bf8ff98531167f8245ba5c44ce7d781503ddf936", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", "installed_by": ["modules"] }, "samtools/flagstat": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules", "bam_stats_samtools"] + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": ["bam_stats_samtools", "modules"] }, "samtools/idxstats": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules", "bam_stats_samtools"] + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": ["bam_stats_samtools", "modules"] }, "samtools/index": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", "installed_by": ["bam_markduplicates_picard", "modules"] }, "samtools/sort": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", "installed_by": ["modules"] }, "samtools/stats": { "branch": "master", - "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", - "installed_by": ["modules", "bam_stats_samtools"] + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", + "installed_by": ["bam_stats_samtools", "modules"] }, "samtools/view": { "branch": "master", - "git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f", + "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", "installed_by": ["modules"] }, "trimgalore": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] } } @@ -86,12 +86,12 @@ "nf-core": { "bam_markduplicates_picard": { "branch": "master", - "git_sha": "a9784afdd5dcda23b84e64db75dc591065d64653", + "git_sha": "eeb9d37c6c8b0ab864b8fe68aa6531c5b2beba01", "installed_by": ["subworkflows"] }, "bam_stats_samtools": { "branch": "master", - "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "git_sha": 
"a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", "installed_by": ["bam_markduplicates_picard", "subworkflows"] } } diff --git a/modules/nf-core/bwa/index/environment.yml b/modules/nf-core/bwa/index/environment.yml new file mode 100644 index 00000000..5d3cb323 --- /dev/null +++ b/modules/nf-core/bwa/index/environment.yml @@ -0,0 +1,7 @@ +name: bwa_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bwa=0.7.17 diff --git a/modules/nf-core/bwa/index/main.nf b/modules/nf-core/bwa/index/main.nf index 8d2e56d9..24b5a2ea 100644 --- a/modules/nf-core/bwa/index/main.nf +++ b/modules/nf-core/bwa/index/main.nf @@ -2,7 +2,7 @@ process BWA_INDEX { tag "$fasta" label 'process_single' - conda "bioconda::bwa=0.7.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/bwa:0.7.17--hed695b0_7' : 'biocontainers/bwa:0.7.17--hed695b0_7' }" @@ -18,13 +18,14 @@ process BWA_INDEX { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${fasta.baseName}" + def args = task.ext.args ?: '' """ mkdir bwa bwa \\ index \\ $args \\ - -p bwa/${fasta.baseName} \\ + -p bwa/${prefix} \\ $fasta cat <<-END_VERSIONS > versions.yml @@ -34,14 +35,15 @@ process BWA_INDEX { """ stub: + def prefix = task.ext.prefix ?: "${fasta.baseName}" """ mkdir bwa - touch bwa/genome.amb - touch bwa/genome.ann - touch bwa/genome.bwt - touch bwa/genome.pac - touch bwa/genome.sa + touch bwa/${prefix}.amb + touch bwa/${prefix}.ann + touch bwa/${prefix}.bwt + touch bwa/${prefix}.pac + touch bwa/${prefix}.sa cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/bwa/index/meta.yml b/modules/nf-core/bwa/index/meta.yml index 2c6cfcd7..730628d0 100644 --- a/modules/nf-core/bwa/index/meta.yml +++ b/modules/nf-core/bwa/index/meta.yml @@ -40,3 +40,6 @@ output: authors: - "@drpatelh" - "@maxulysse" +maintainers: + - "@drpatelh" + - "@maxulysse" diff --git a/modules/nf-core/bwa/index/tests/main.nf.test b/modules/nf-core/bwa/index/tests/main.nf.test new file mode 100644 index 00000000..5fc8d496 --- /dev/null +++ b/modules/nf-core/bwa/index/tests/main.nf.test @@ -0,0 +1,33 @@ +nextflow_process { + + name "Test Process BWA_INDEX" + tag "modules_nfcore" + tag "modules" + tag "bwa" + tag "bwa/index" + script "../main.nf" + process "BWA_INDEX" + + test("BWA index") { + + when { + process { + """ + input[0] = [ + [id: 'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/bwa/index/tests/main.nf.test.snap b/modules/nf-core/bwa/index/tests/main.nf.test.snap new file mode 100644 index 00000000..e51ad5bf --- /dev/null +++ b/modules/nf-core/bwa/index/tests/main.nf.test.snap @@ -0,0 +1,43 @@ +{ + "BWA index": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "genome.amb:md5,3a68b8b2287e07dd3f5f95f4344ba76e", + "genome.ann:md5,c32e11f6c859f166c7525a9c1d583567", + "genome.bwt:md5,0469c30a1e239dd08f68afe66fde99da", + "genome.pac:md5,983e3d2cd6f36e2546e6d25a0da78d66", + "genome.sa:md5,ab3952cabf026b48cd3eb5bccbb636d1" + ] + ] + ], + "1": [ + "versions.yml:md5,0f20525da90e7489a7ebb02adca3265f" + ], + "index": [ + [ + { + "id": "test" + }, + [ + "genome.amb:md5,3a68b8b2287e07dd3f5f95f4344ba76e", + 
"genome.ann:md5,c32e11f6c859f166c7525a9c1d583567", + "genome.bwt:md5,0469c30a1e239dd08f68afe66fde99da", + "genome.pac:md5,983e3d2cd6f36e2546e6d25a0da78d66", + "genome.sa:md5,ab3952cabf026b48cd3eb5bccbb636d1" + ] + ] + ], + "versions": [ + "versions.yml:md5,0f20525da90e7489a7ebb02adca3265f" + ] + } + ], + "timestamp": "2023-10-17T17:20:20.180927714" + } +} \ No newline at end of file diff --git a/modules/nf-core/bwa/index/tests/tags.yml b/modules/nf-core/bwa/index/tests/tags.yml new file mode 100644 index 00000000..28bb483c --- /dev/null +++ b/modules/nf-core/bwa/index/tests/tags.yml @@ -0,0 +1,2 @@ +bwa/index: + - modules/nf-core/bwa/index/** diff --git a/modules/nf-core/cat/fastq/environment.yml b/modules/nf-core/cat/fastq/environment.yml new file mode 100644 index 00000000..bff93add --- /dev/null +++ b/modules/nf-core/cat/fastq/environment.yml @@ -0,0 +1,7 @@ +name: cat_fastq +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::sed=4.7 diff --git a/modules/nf-core/cat/fastq/main.nf b/modules/nf-core/cat/fastq/main.nf index 5021e6fc..3d963784 100644 --- a/modules/nf-core/cat/fastq/main.nf +++ b/modules/nf-core/cat/fastq/main.nf @@ -2,7 +2,7 @@ process CAT_FASTQ { tag "$meta.id" label 'process_single' - conda "conda-forge::sed=4.7" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : 'nf-core/ubuntu:20.04' }" diff --git a/modules/nf-core/cat/fastq/meta.yml b/modules/nf-core/cat/fastq/meta.yml index 8a39e309..db4ac3c7 100644 --- a/modules/nf-core/cat/fastq/meta.yml +++ b/modules/nf-core/cat/fastq/meta.yml @@ -34,7 +34,9 @@ output: type: file description: File containing software versions pattern: "versions.yml" - authors: - "@joseespinosa" - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/cat/fastq/tests/main.nf.test b/modules/nf-core/cat/fastq/tests/main.nf.test new file mode 100644 index 00000000..f5f94182 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/main.nf.test @@ -0,0 +1,143 @@ +nextflow_process { + + name "Test Process CAT_FASTQ" + script "../main.nf" + process "CAT_FASTQ" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/fastq" + + test("test_cat_fastq_single_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_paired_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert 
path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_single_end_same_name") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_paired_end_same_name") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_single_end_single_file") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true)] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } +} diff --git a/modules/nf-core/cat/fastq/tests/main.nf.test.snap b/modules/nf-core/cat/fastq/tests/main.nf.test.snap new file mode 100644 index 00000000..ec2342e5 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/main.nf.test.snap @@ -0,0 +1,78 @@ +{ + "test_cat_fastq_single_end": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,f9cf5e375f7de81a406144a2c70cc64d" + ] + ] + ], + "timestamp": "2023-10-17T23:19:12.990284837" + }, + "test_cat_fastq_single_end_same_name": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,63f817db7a29a03eb538104495556f66" + ] + ] + ], + "timestamp": "2023-10-17T23:19:31.554568147" + }, + "test_cat_fastq_single_end_single_file": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,e325ef7deb4023447a1f074e285761af" + ] + ] + ], + "timestamp": "2023-10-17T23:19:49.629360033" + }, + "test_cat_fastq_paired_end_same_name": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,63f817db7a29a03eb538104495556f66", + "test_2.merged.fastq.gz:md5,fe9f266f43a6fc3dcab690a18419a56e" + ] + ] + ] + ], + "timestamp": "2023-10-17T23:19:40.711617539" + }, + "test_cat_fastq_paired_end": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,f9cf5e375f7de81a406144a2c70cc64d", + "test_2.merged.fastq.gz:md5,77c8e966e130d8c6b6ec9be52fcb2bda" + ] + ] + ] + ], + "timestamp": "2023-10-18T07:53:20.923560211" + } +} \ No newline at end of file diff --git a/modules/nf-core/cat/fastq/tests/tags.yml 
b/modules/nf-core/cat/fastq/tests/tags.yml new file mode 100644 index 00000000..6ac43614 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/tags.yml @@ -0,0 +1,2 @@ +cat/fastq: + - modules/nf-core/cat/fastq/** diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index ebc87273..7685b33c 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.14" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : - 'biocontainers/multiqc:1.14--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : + 'biocontainers/multiqc:1.17--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test index eec1db10..b1e1630b 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test @@ -31,7 +31,12 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(process.out).match() } + { assert snapshot( + process.out.versions, + file(process.out.mqc_yml[0]).readLines()[0..10], + file(process.out.yml[0]).readLines()[0..7] + ).match() + } ) } } diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap index 4274ed57..29e72446 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap @@ -1,27 +1,33 @@ { "Should run without failures": { "content": [ - { - "0": [ - "software_versions.yml:md5,1c851188476409cda5752ce971b20b58" - ], - "1": [ - "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d" - ], - "2": [ - "versions.yml:md5,3843ac526e762117eedf8825b40683df" - ], - "mqc_yml": [ - "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d" - ], - "versions": [ - "versions.yml:md5,3843ac526e762117eedf8825b40683df" - ], - "yml": [ - "software_versions.yml:md5,1c851188476409cda5752ce971b20b58" - ] - } + [ + "versions.yml:md5,3843ac526e762117eedf8825b40683df" + ], + [ + "data: \"\\n\\n \\n \\n \\n \\n \\n \\n \\n\\", + " \\n\\n\\n \\n \\n\\", + " \\ \\n\\n\\n\\n \\n \\", + " \\ \\n \\n\\n\\n\\n\\", + " \\n\\n \\n \\n\\", + " \\ \\n\\n\\n\\n\\n\\n \\n\\", + " \\ \\n \\n\\n\\n\\n\\", + " \\n\\n \\n \\n\\" + ], + [ + "CUSTOM_DUMPSOFTWAREVERSIONS:", + " python: 3.12.0", + " yaml: 6.0.1", + "TOOL1:", + " tool1: 0.11.9", + "TOOL2:", + " tool2: '1.9'", + "Workflow:" + ] ], - "timestamp": "2023-11-03T14:43:22.157011" + "timestamp": "2024-01-05T00:18:43.461970077" } -} +} \ No newline at end of file diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf index 07d5e433..9e19a74c 100644 --- a/modules/nf-core/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ -2,10 +2,10 @@ process FASTQC { tag "$meta.id" label 'process_medium' - conda "bioconda::fastqc=0.11.9" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 
'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0' : - 'biocontainers/fastqc:0.11.9--0' }" + 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' : + 'biocontainers/fastqc:0.12.1--hdfd78af_0' }" input: tuple val(meta), path(reads) @@ -29,11 +29,15 @@ process FASTQC { printf "%s %s\\n" $rename_to | while read old_name new_name; do [ -f "\${new_name}" ] || ln -s \$old_name \$new_name done - fastqc $args --threads $task.cpus $renamed_files + + fastqc \\ + $args \\ + --threads $task.cpus \\ + $renamed_files cat <<-END_VERSIONS > versions.yml "${task.process}": - fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) + fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) END_VERSIONS """ @@ -45,7 +49,7 @@ process FASTQC { cat <<-END_VERSIONS > versions.yml "${task.process}": - fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) + fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) END_VERSIONS """ } diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test index b9e8f926..ad9bc54f 100644 --- a/modules/nf-core/fastqc/tests/main.nf.test +++ b/modules/nf-core/fastqc/tests/main.nf.test @@ -3,23 +3,21 @@ nextflow_process { name "Test Process FASTQC" script "../main.nf" process "FASTQC" + tag "modules" tag "modules_nfcore" tag "fastqc" - test("Single-Read") { + test("sarscov2 single-end [fastq]") { when { - params { - outdir = "$outputDir" - } process { """ input[0] = [ - [ id: 'test', single_end:true ], - [ - file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) - ] + [ id: 'test', single_end:true ], + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] ] """ } @@ -28,82 +26,195 @@ nextflow_process { then { assertAll ( { assert process.success }, + // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it. // looks like this:
Mon 2 Oct 2023
test.gz
// https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039 - { assert process.out.html.get(0).get(1) ==~ ".*/test_fastqc.html" }, - { assert path(process.out.html.get(0).get(1)).getText().contains("") }, - { assert snapshot(process.out.versions).match("versions") }, - { assert process.out.zip.get(0).get(1) ==~ ".*/test_fastqc.zip" } + + { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 paired-end [fastq]") { + + when { + process { + """ + input[0] = [ + [id: 'test', single_end: false], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, + { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, + { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, + { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, + { assert path(process.out.html[0][1][0]).text.contains("") }, + { assert path(process.out.html[0][1][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 interleaved [fastq]") { + + when { + process { + """ + input[0] = [ + [id: 'test', single_end: false], // meta map + file(params.test_data['sarscov2']['illumina']['test_interleaved_fastq_gz'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 paired-end [bam]") { + + when { + process { + """ + input[0] = [ + [id: 'test', single_end: false], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } ) } } -// TODO -// // -// // Test with paired-end data -// // -// workflow test_fastqc_paired_end { -// input = [ -// [id: 'test', single_end: false], // meta map -// [ -// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), -// file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) -// ] -// ] - -// FASTQC ( input ) -// } - -// // -// // Test with interleaved data -// // -// workflow test_fastqc_interleaved { -// input = [ -// [id: 'test', single_end: false], // meta map -// file(params.test_data['sarscov2']['illumina']['test_interleaved_fastq_gz'], checkIfExists: true) -// ] - -// FASTQC ( input ) -// } - -// // -// // Test with bam data -// // -// workflow test_fastqc_bam { -// input = [ -// [id: 'test', single_end: false], // meta map -// file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) -// ] - -// FASTQC ( input ) -// 
} - -// // -// // Test with multiple samples -// // -// workflow test_fastqc_multiple { -// input = [ -// [id: 'test', single_end: false], // meta map -// [ -// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), -// file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), -// file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true), -// file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true) -// ] -// ] - -// FASTQC ( input ) -// } - -// // -// // Test with custom prefix -// // -// workflow test_fastqc_custom_prefix { -// input = [ -// [ id:'mysample', single_end:true ], // meta map -// file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) -// ] - -// FASTQC ( input ) -// } + + test("sarscov2 multiple [fastq]") { + + when { + process { + """ + input[0] = [ + [id: 'test', single_end: false], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, + { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, + { assert process.out.html[0][1][2] ==~ ".*/test_3_fastqc.html" }, + { assert process.out.html[0][1][3] ==~ ".*/test_4_fastqc.html" }, + { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, + { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, + { assert process.out.zip[0][1][2] ==~ ".*/test_3_fastqc.zip" }, + { assert process.out.zip[0][1][3] ==~ ".*/test_4_fastqc.zip" }, + { assert path(process.out.html[0][1][0]).text.contains("") }, + { assert path(process.out.html[0][1][1]).text.contains("") }, + { assert path(process.out.html[0][1][2]).text.contains("") }, + { assert path(process.out.html[0][1][3]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 custom_prefix") { + + when { + process { + """ + input[0] = [ + [ id:'mysample', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1] ==~ ".*/mysample_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/mysample_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 single-end [fastq] - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id: 'test', single_end:true ], + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.html.collect { file(it[1]).getName() } + + process.out.zip.collect { file(it[1]).getName() } + + process.out.versions ).match() } + ) + } + } + } diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap index 636a32ce..5ef5afbd 100644 --- a/modules/nf-core/fastqc/tests/main.nf.test.snap +++ 
b/modules/nf-core/fastqc/tests/main.nf.test.snap @@ -1,10 +1,20 @@ { + "sarscov2 single-end [fastq] - stub": { + "content": [ + [ + "test.html", + "test.zip", + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "timestamp": "2023-12-29T02:48:05.126117287" + }, "versions": { "content": [ [ "versions.yml:md5,e1cc25ca8af856014824abd842e93978" ] ], - "timestamp": "2023-10-09T23:40:54+0000" + "timestamp": "2023-12-29T02:46:49.507942667" } } \ No newline at end of file diff --git a/modules/nf-core/minimap2/align/environment.yml b/modules/nf-core/minimap2/align/environment.yml new file mode 100644 index 00000000..de1f3811 --- /dev/null +++ b/modules/nf-core/minimap2/align/environment.yml @@ -0,0 +1,8 @@ +name: minimap2_align +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::minimap2=2.24 + - bioconda::samtools=1.18 diff --git a/modules/nf-core/minimap2/align/main.nf b/modules/nf-core/minimap2/align/main.nf index 4da47c18..47cd420c 100644 --- a/modules/nf-core/minimap2/align/main.nf +++ b/modules/nf-core/minimap2/align/main.nf @@ -3,14 +3,14 @@ process MINIMAP2_ALIGN { label 'process_medium' // Note: the versions here need to match the versions used in the mulled container below and minimap2/index - conda "bioconda::minimap2=2.24 bioconda::samtools=1.14" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' : - 'biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:365b17b986c1a60c1b82c6066a9345f38317b763-0' : + 'biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:365b17b986c1a60c1b82c6066a9345f38317b763-0' }" input: tuple val(meta), path(reads) - path reference + tuple val(meta2), path(reference) val bam_format val cigar_paf_format val cigar_bam diff --git a/modules/nf-core/minimap2/align/meta.yml b/modules/nf-core/minimap2/align/meta.yml index 991b39a0..408522d5 100644 --- a/modules/nf-core/minimap2/align/meta.yml +++ b/modules/nf-core/minimap2/align/meta.yml @@ -25,6 +25,11 @@ input: description: | List of input FASTA or FASTQ files of size 1 and 2 for single-end and paired-end data, respectively. + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test_ref'] - reference: type: file description: | @@ -63,3 +68,8 @@ authors: - "@sofstam" - "@sateeshperi" - "@jfy133" +maintainers: + - "@heuermh" + - "@sofstam" + - "@sateeshperi" + - "@jfy133" diff --git a/modules/nf-core/minimap2/align/tests/main.nf.test b/modules/nf-core/minimap2/align/tests/main.nf.test new file mode 100644 index 00000000..b634468b --- /dev/null +++ b/modules/nf-core/minimap2/align/tests/main.nf.test @@ -0,0 +1,145 @@ +nextflow_process { + + name "Test Process MINIMAP2_ALIGN" + script "../main.nf" + process "MINIMAP2_ALIGN" + + tag "modules" + tag "modules_nfcore" + tag "minimap2" + tag "minimap2/align" + + test("sarscov2 - fastq, fasta, true, false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = true + input[3] = false + input[4] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [fastq1, fastq2], fasta, true, false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = true + input[3] = false + input[4] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - fastq, [], true, false, false") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + [] + ] + input[2] = true + input[3] = false + input[4] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - fastq, fasta, true, false, false - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = [ + [ id:'test_ref' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = true + input[3] = false + input[4] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/minimap2/align/tests/main.nf.test.snap b/modules/nf-core/minimap2/align/tests/main.nf.test.snap new file mode 100644 index 00000000..a39a1697 --- /dev/null +++ b/modules/nf-core/minimap2/align/tests/main.nf.test.snap @@ -0,0 +1,38 @@ +{ + "sarscov2 - fastq, fasta, true, false, false": { + "content": [ + "test.bam", + [ + "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1" + ] + ], + 
"timestamp": "2023-12-04T12:07:06.01315354" + }, + "sarscov2 - fastq, fasta, true, false, false - stub": { + "content": [ + "test.bam", + [ + "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1" + ] + ], + "timestamp": "2023-12-04T12:07:24.487175659" + }, + "sarscov2 - [fastq1, fastq2], fasta, true, false, false": { + "content": [ + "test.bam", + [ + "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1" + ] + ], + "timestamp": "2023-12-04T12:07:12.50816279" + }, + "sarscov2 - fastq, [], true, false, false": { + "content": [ + "test.bam", + [ + "versions.yml:md5,9e9eeae0002d466d580a9d6e0d003eb1" + ] + ], + "timestamp": "2023-12-04T12:07:18.414974788" + } +} \ No newline at end of file diff --git a/modules/nf-core/minimap2/align/tests/tags.yml b/modules/nf-core/minimap2/align/tests/tags.yml new file mode 100644 index 00000000..39dba374 --- /dev/null +++ b/modules/nf-core/minimap2/align/tests/tags.yml @@ -0,0 +1,2 @@ +minimap2/align: + - "modules/nf-core/minimap2/align/**" diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 1fc387be..70708f33 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_single' - conda "bioconda::multiqc=1.14" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : - 'biocontainers/multiqc:1.14--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.18--pyhdfd78af_0' : + 'biocontainers/multiqc:1.18--pyhdfd78af_0' }" input: path multiqc_files, stageAs: "?/*" @@ -25,12 +25,14 @@ process MULTIQC { def args = task.ext.args ?: '' def config = multiqc_config ? "--config $multiqc_config" : '' def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' + def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : '' """ multiqc \\ --force \\ $args \\ $config \\ $extra_config \\ + $logo \\ . 
cat <<-END_VERSIONS > versions.yml @@ -41,7 +43,7 @@ process MULTIQC { stub: """ - touch multiqc_data + mkdir multiqc_data touch multiqc_plots touch multiqc_report.html diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index f1aa660e..45a9bc35 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -1,4 +1,3 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: multiqc description: Aggregate results from bioinformatics analyses across many samples into a single report keywords: diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test index c2dad217..d0438eda 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test +++ b/modules/nf-core/multiqc/tests/main.nf.test @@ -7,12 +7,9 @@ nextflow_process { tag "modules_nfcore" tag "multiqc" - test("MULTIQC: FASTQC") { + test("sarscov2 single-end [fastqc]") { when { - params { - outdir = "$outputDir" - } process { """ input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) @@ -26,20 +23,17 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert path(process.out.report.get(0)).exists() }, - { assert path(process.out.data.get(0)).exists() }, - { assert path(process.out.versions.get(0)).getText().contains("multiqc") } + { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, + { assert process.out.data[0] ==~ ".*/multiqc_data" }, + { assert snapshot(process.out.versions).match("versions") } ) } } - test("MULTIQC: FASTQC and a config file") { + test("sarscov2 single-end [fastqc] [config]") { when { - params { - outdir = "$outputDir" - } process { """ input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) @@ -53,9 +47,35 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert path(process.out.report.get(0)).exists() }, - { assert path(process.out.data.get(0)).exists() }, - { assert path(process.out.versions.get(0)).getText().contains("multiqc") } + { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, + { assert process.out.data[0] ==~ ".*/multiqc_data" }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 single-end [fastqc] - stub") { + + options "-stub" + + when { + process { + """ + input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) + input[1] = [] + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.report.collect { file(it).getName() } + + process.out.data.collect { file(it).getName() } + + process.out.plots.collect { file(it).getName() } + + process.out.versions ).match() } ) } diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap new file mode 100644 index 00000000..d087a9df --- /dev/null +++ b/modules/nf-core/multiqc/tests/main.nf.test.snap @@ -0,0 +1,21 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,f81e19ab3a8e2b6f2b5d22078117df71" + ] + ], + "timestamp": "2023-12-30T00:26:14.048089591" + }, + "sarscov2 single-end [fastqc] - stub": { + "content": [ + [ + "multiqc_report.html", + "multiqc_data", + "multiqc_plots", + "versions.yml:md5,f81e19ab3a8e2b6f2b5d22078117df71" + ] + ], + "timestamp": 
"2023-12-30T00:26:52.963964055" + } +} \ No newline at end of file diff --git a/modules/nf-core/picard/markduplicates/environment.yml b/modules/nf-core/picard/markduplicates/environment.yml new file mode 100644 index 00000000..58b795f5 --- /dev/null +++ b/modules/nf-core/picard/markduplicates/environment.yml @@ -0,0 +1,7 @@ +name: picard_markduplicates +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::picard=3.1.1 diff --git a/modules/nf-core/picard/markduplicates/main.nf b/modules/nf-core/picard/markduplicates/main.nf index facd7efb..80930cc4 100644 --- a/modules/nf-core/picard/markduplicates/main.nf +++ b/modules/nf-core/picard/markduplicates/main.nf @@ -2,10 +2,10 @@ process PICARD_MARKDUPLICATES { tag "$meta.id" label 'process_medium' - conda "bioconda::picard=3.0.0" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/picard:3.0.0--hdfd78af_1' : - 'biocontainers/picard:3.0.0--hdfd78af_1' }" + 'https://depot.galaxyproject.org/singularity/picard:3.1.1--hdfd78af_0' : + 'biocontainers/picard:3.1.1--hdfd78af_0' }" input: tuple val(meta), path(bam) @@ -30,6 +30,9 @@ process PICARD_MARKDUPLICATES { } else { avail_mem = (task.memory.mega*0.8).intValue() } + + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ picard \\ -Xmx${avail_mem}M \\ @@ -48,6 +51,7 @@ process PICARD_MARKDUPLICATES { stub: def prefix = task.ext.prefix ?: "${meta.id}" + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" """ touch ${prefix}.bam touch ${prefix}.bam.bai diff --git a/modules/nf-core/picard/markduplicates/meta.yml b/modules/nf-core/picard/markduplicates/meta.yml index f7693d2f..1ab90c07 100644 --- a/modules/nf-core/picard/markduplicates/meta.yml +++ b/modules/nf-core/picard/markduplicates/meta.yml @@ -69,3 +69,7 @@ authors: - "@drpatelh" - "@projectoriented" - "@ramprasadn" +maintainers: + - "@drpatelh" + - "@projectoriented" + - "@ramprasadn" diff --git a/modules/nf-core/picard/markduplicates/tests/main.nf.test b/modules/nf-core/picard/markduplicates/tests/main.nf.test new file mode 100644 index 00000000..b2bba094 --- /dev/null +++ b/modules/nf-core/picard/markduplicates/tests/main.nf.test @@ -0,0 +1,111 @@ +nextflow_process { + + name "Test Process PICARD_MARKDUPLICATES" + script "../main.nf" + process "PICARD_MARKDUPLICATES" + config "./nextflow.config" + tag "modules" + tag "modules_nfcore" + tag "picard" + tag "picard/markduplicates" + + test("sarscov2 - bam, fasta, fai - sorted bam") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = [ + [ id:'genome' ], + file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + path(process.out.metrics.get(0).get(1)).readLines()[0..2], + process.out.versions + ).match() } + ) + } + } + + test("sarscov2 - bam, fasta, fai - unsorted bam") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + 
file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = [ + [ id:'genome' ], + file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + path(process.out.metrics.get(0).get(1)).readLines()[0..2], + process.out.versions + ).match() } + ) + } + } + + test("homo_sapiens - cram, fasta, fai") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram'], checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta_fai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + path(process.out.metrics.get(0).get(1)).readLines()[0..2], + process.out.versions + ).match() } + ) + } + } + +} diff --git a/modules/nf-core/picard/markduplicates/tests/main.nf.test.snap b/modules/nf-core/picard/markduplicates/tests/main.nf.test.snap new file mode 100644 index 00000000..cd788a4d --- /dev/null +++ b/modules/nf-core/picard/markduplicates/tests/main.nf.test.snap @@ -0,0 +1,44 @@ +{ + "sarscov2 - bam, fasta, fai - unsorted bam": { + "content": [ + "test.marked.bam", + [ + "## htsjdk.samtools.metrics.StringHeader", + "# MarkDuplicates --INPUT test.paired_end.bam --OUTPUT test.marked.bam --METRICS_FILE test.marked.MarkDuplicates.metrics.txt --ASSUME_SORT_ORDER queryname --REFERENCE_SEQUENCE genome.fasta --MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP 50000 --MAX_FILE_HANDLES_FOR_READ_ENDS_MAP 8000 --SORTING_COLLECTION_SIZE_RATIO 0.25 --TAG_DUPLICATE_SET_MEMBERS false --REMOVE_SEQUENCING_DUPLICATES false --TAGGING_POLICY DontTag --CLEAR_DT true --DUPLEX_UMI false --FLOW_MODE false --FLOW_QUALITY_SUM_STRATEGY false --USE_END_IN_UNPAIRED_READS false --USE_UNPAIRED_CLIPPED_END false --UNPAIRED_END_UNCERTAINTY 0 --FLOW_SKIP_FIRST_N_FLOWS 0 --FLOW_Q_IS_KNOWN_END false --FLOW_EFFECTIVE_QUALITY_THRESHOLD 15 --ADD_PG_TAG_TO_READS true --REMOVE_DUPLICATES false --ASSUME_SORTED false --DUPLICATE_SCORING_STRATEGY SUM_OF_BASE_QUALITIES --PROGRAM_RECORD_ID MarkDuplicates --PROGRAM_GROUP_NAME MarkDuplicates --READ_NAME_REGEX --OPTICAL_DUPLICATE_PIXEL_DISTANCE 100 --MAX_OPTICAL_DUPLICATE_SET_SIZE 300000 --VERBOSITY INFO --QUIET false --VALIDATION_STRINGENCY STRICT --COMPRESSION_LEVEL 5 --MAX_RECORDS_IN_RAM 500000 --CREATE_INDEX false --CREATE_MD5_FILE false --help false --version false --showHidden false --USE_JDK_DEFLATER false --USE_JDK_INFLATER false", + "## htsjdk.samtools.metrics.StringHeader" + ], + [ + "versions.yml:md5,b699af51b1956f3810f8a7c066e0ab17" + ] + ], + "timestamp": "2023-11-28T10:50:37.735339781" + }, + "homo_sapiens - cram, fasta, fai": { + "content": [ + "test.marked.bam", + [ + "## htsjdk.samtools.metrics.StringHeader", + "# MarkDuplicates --INPUT test.paired_end.sorted.cram --OUTPUT test.marked.bam --METRICS_FILE test.marked.MarkDuplicates.metrics.txt --ASSUME_SORT_ORDER queryname --REFERENCE_SEQUENCE genome.fasta --MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP 50000 --MAX_FILE_HANDLES_FOR_READ_ENDS_MAP 
8000 --SORTING_COLLECTION_SIZE_RATIO 0.25 --TAG_DUPLICATE_SET_MEMBERS false --REMOVE_SEQUENCING_DUPLICATES false --TAGGING_POLICY DontTag --CLEAR_DT true --DUPLEX_UMI false --FLOW_MODE false --FLOW_QUALITY_SUM_STRATEGY false --USE_END_IN_UNPAIRED_READS false --USE_UNPAIRED_CLIPPED_END false --UNPAIRED_END_UNCERTAINTY 0 --FLOW_SKIP_FIRST_N_FLOWS 0 --FLOW_Q_IS_KNOWN_END false --FLOW_EFFECTIVE_QUALITY_THRESHOLD 15 --ADD_PG_TAG_TO_READS true --REMOVE_DUPLICATES false --ASSUME_SORTED false --DUPLICATE_SCORING_STRATEGY SUM_OF_BASE_QUALITIES --PROGRAM_RECORD_ID MarkDuplicates --PROGRAM_GROUP_NAME MarkDuplicates --READ_NAME_REGEX --OPTICAL_DUPLICATE_PIXEL_DISTANCE 100 --MAX_OPTICAL_DUPLICATE_SET_SIZE 300000 --VERBOSITY INFO --QUIET false --VALIDATION_STRINGENCY STRICT --COMPRESSION_LEVEL 5 --MAX_RECORDS_IN_RAM 500000 --CREATE_INDEX false --CREATE_MD5_FILE false --help false --version false --showHidden false --USE_JDK_DEFLATER false --USE_JDK_INFLATER false", + "## htsjdk.samtools.metrics.StringHeader" + ], + [ + "versions.yml:md5,b699af51b1956f3810f8a7c066e0ab17" + ] + ], + "timestamp": "2023-11-28T10:50:48.897954543" + }, + "sarscov2 - bam, fasta, fai - sorted bam": { + "content": [ + "test.marked.bam", + [ + "## htsjdk.samtools.metrics.StringHeader", + "# MarkDuplicates --INPUT test.paired_end.sorted.bam --OUTPUT test.marked.bam --METRICS_FILE test.marked.MarkDuplicates.metrics.txt --ASSUME_SORT_ORDER queryname --REFERENCE_SEQUENCE genome.fasta --MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP 50000 --MAX_FILE_HANDLES_FOR_READ_ENDS_MAP 8000 --SORTING_COLLECTION_SIZE_RATIO 0.25 --TAG_DUPLICATE_SET_MEMBERS false --REMOVE_SEQUENCING_DUPLICATES false --TAGGING_POLICY DontTag --CLEAR_DT true --DUPLEX_UMI false --FLOW_MODE false --FLOW_QUALITY_SUM_STRATEGY false --USE_END_IN_UNPAIRED_READS false --USE_UNPAIRED_CLIPPED_END false --UNPAIRED_END_UNCERTAINTY 0 --FLOW_SKIP_FIRST_N_FLOWS 0 --FLOW_Q_IS_KNOWN_END false --FLOW_EFFECTIVE_QUALITY_THRESHOLD 15 --ADD_PG_TAG_TO_READS true --REMOVE_DUPLICATES false --ASSUME_SORTED false --DUPLICATE_SCORING_STRATEGY SUM_OF_BASE_QUALITIES --PROGRAM_RECORD_ID MarkDuplicates --PROGRAM_GROUP_NAME MarkDuplicates --READ_NAME_REGEX --OPTICAL_DUPLICATE_PIXEL_DISTANCE 100 --MAX_OPTICAL_DUPLICATE_SET_SIZE 300000 --VERBOSITY INFO --QUIET false --VALIDATION_STRINGENCY STRICT --COMPRESSION_LEVEL 5 --MAX_RECORDS_IN_RAM 500000 --CREATE_INDEX false --CREATE_MD5_FILE false --help false --version false --showHidden false --USE_JDK_DEFLATER false --USE_JDK_INFLATER false", + "## htsjdk.samtools.metrics.StringHeader" + ], + [ + "versions.yml:md5,b699af51b1956f3810f8a7c066e0ab17" + ] + ], + "timestamp": "2023-11-28T10:50:26.591387512" + } +} \ No newline at end of file diff --git a/modules/nf-core/picard/markduplicates/tests/nextflow.config b/modules/nf-core/picard/markduplicates/tests/nextflow.config new file mode 100644 index 00000000..02818dd6 --- /dev/null +++ b/modules/nf-core/picard/markduplicates/tests/nextflow.config @@ -0,0 +1,6 @@ +process { + withName: PICARD_MARKDUPLICATES { + ext.prefix = { "${meta.id}.marked" } + ext.args = '--ASSUME_SORT_ORDER queryname' + } +} diff --git a/modules/nf-core/picard/markduplicates/tests/tags.yml b/modules/nf-core/picard/markduplicates/tests/tags.yml new file mode 100644 index 00000000..4f213d62 --- /dev/null +++ b/modules/nf-core/picard/markduplicates/tests/tags.yml @@ -0,0 +1,2 @@ +picard/markduplicates: + - modules/nf-core/picard/markduplicates/** diff --git a/modules/nf-core/samtools/faidx/environment.yml 
b/modules/nf-core/samtools/faidx/environment.yml new file mode 100644 index 00000000..01ccbcc7 --- /dev/null +++ b/modules/nf-core/samtools/faidx/environment.yml @@ -0,0 +1,7 @@ +name: samtools_faidx +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.18 diff --git a/modules/nf-core/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf index c1e8ef3a..d3461627 100644 --- a/modules/nf-core/samtools/faidx/main.nf +++ b/modules/nf-core/samtools/faidx/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_FAIDX { tag "$fasta" label 'process_single' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" input: tuple val(meta), path(fasta) @@ -35,8 +35,12 @@ process SAMTOOLS_FAIDX { """ stub: + def match = (task.ext.args =~ /-o(?:utput)?\s(.*)\s?/).findAll() + def fastacmd = match[0] ? "touch ${match[0][1]}" : '' """ + ${fastacmd} touch ${fasta}.fai + cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml index 957b25e5..e189af28 100644 --- a/modules/nf-core/samtools/faidx/meta.yml +++ b/modules/nf-core/samtools/faidx/meta.yml @@ -55,3 +55,7 @@ authors: - "@drpatelh" - "@ewels" - "@phue" +maintainers: + - "@drpatelh" + - "@ewels" + - "@phue" diff --git a/modules/nf-core/samtools/flagstat/environment.yml b/modules/nf-core/samtools/flagstat/environment.yml new file mode 100644 index 00000000..5efae053 --- /dev/null +++ b/modules/nf-core/samtools/flagstat/environment.yml @@ -0,0 +1,7 @@ +name: samtools_flagstat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.18 diff --git a/modules/nf-core/samtools/flagstat/main.nf b/modules/nf-core/samtools/flagstat/main.nf index eb7e72fc..f1893d7c 100644 --- a/modules/nf-core/samtools/flagstat/main.nf +++ b/modules/nf-core/samtools/flagstat/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_FLAGSTAT { tag "$meta.id" label 'process_single' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" input: tuple val(meta), path(bam), path(bai) @@ -32,4 +32,15 @@ process SAMTOOLS_FLAGSTAT { samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') END_VERSIONS """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.flagstat + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ } diff --git a/modules/nf-core/samtools/flagstat/meta.yml b/modules/nf-core/samtools/flagstat/meta.yml index 954225df..97991358 100644 --- a/modules/nf-core/samtools/flagstat/meta.yml +++ b/modules/nf-core/samtools/flagstat/meta.yml @@ -47,3 +47,5 @@ output: pattern: "versions.yml" authors: - "@drpatelh" +maintainers: + - "@drpatelh" diff --git a/modules/nf-core/samtools/flagstat/tests/main.nf.test b/modules/nf-core/samtools/flagstat/tests/main.nf.test new file mode 100644 index 00000000..c8dd8dc9 --- /dev/null +++ b/modules/nf-core/samtools/flagstat/tests/main.nf.test @@ -0,0 +1,36 @@ +nextflow_process { + + name "Test Process SAMTOOLS_FLAGSTAT" + script "../main.nf" + process "SAMTOOLS_FLAGSTAT" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/flagstat" + + test("BAM") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.flagstat).match() }, + { assert path(process.out.versions.get(0)).getText().contains("samtools") } + ) + } + } +} diff --git a/modules/nf-core/samtools/flagstat/tests/main.nf.test.snap b/modules/nf-core/samtools/flagstat/tests/main.nf.test.snap new file mode 100644 index 00000000..880019f2 --- /dev/null +++ b/modules/nf-core/samtools/flagstat/tests/main.nf.test.snap @@ -0,0 +1,16 @@ +{ + "BAM": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.flagstat:md5,4f7ffd1e6a5e85524d443209ac97d783" + ] + ] + ], + "timestamp": "2023-11-14T15:49:22.577133" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/flagstat/tests/tags.yml b/modules/nf-core/samtools/flagstat/tests/tags.yml new file mode 100644 index 00000000..2d2b7255 --- /dev/null +++ b/modules/nf-core/samtools/flagstat/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/flagstat: + - modules/nf-core/samtools/flagstat/** diff --git a/modules/nf-core/samtools/idxstats/environment.yml b/modules/nf-core/samtools/idxstats/environment.yml new file mode 100644 index 00000000..2401db0f --- /dev/null +++ b/modules/nf-core/samtools/idxstats/environment.yml @@ -0,0 +1,7 @@ +name: samtools_idxstats +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.18 diff --git a/modules/nf-core/samtools/idxstats/main.nf b/modules/nf-core/samtools/idxstats/main.nf index a257d700..00d916bb 100644 --- a/modules/nf-core/samtools/idxstats/main.nf +++ b/modules/nf-core/samtools/idxstats/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_IDXSTATS { tag "$meta.id" label 'process_single' - 
conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" input: tuple val(meta), path(bam), path(bai) @@ -33,4 +33,16 @@ process SAMTOOLS_IDXSTATS { samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') END_VERSIONS """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.idxstats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ } diff --git a/modules/nf-core/samtools/idxstats/meta.yml b/modules/nf-core/samtools/idxstats/meta.yml index dda87e1e..344e92a3 100644 --- a/modules/nf-core/samtools/idxstats/meta.yml +++ b/modules/nf-core/samtools/idxstats/meta.yml @@ -48,3 +48,5 @@ output: pattern: "versions.yml" authors: - "@drpatelh" +maintainers: + - "@drpatelh" diff --git a/modules/nf-core/samtools/idxstats/tests/main.nf.test b/modules/nf-core/samtools/idxstats/tests/main.nf.test new file mode 100644 index 00000000..f6c92150 --- /dev/null +++ b/modules/nf-core/samtools/idxstats/tests/main.nf.test @@ -0,0 +1,36 @@ +nextflow_process { + + name "Test Process SAMTOOLS_IDXSTATS" + script "../main.nf" + process "SAMTOOLS_IDXSTATS" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/idxstats" + + test("BAM") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.idxstats).match() }, + { assert path(process.out.versions.get(0)).getText().contains("samtools") } + ) + } + } +} diff --git a/modules/nf-core/samtools/idxstats/tests/main.nf.test.snap b/modules/nf-core/samtools/idxstats/tests/main.nf.test.snap new file mode 100644 index 00000000..4c6c12bd --- /dev/null +++ b/modules/nf-core/samtools/idxstats/tests/main.nf.test.snap @@ -0,0 +1,16 @@ +{ + "BAM": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.idxstats:md5,df60a8c8d6621100d05178c93fb053a2" + ] + ] + ], + "timestamp": "2023-11-14T15:52:19.875194" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/idxstats/tests/tags.yml b/modules/nf-core/samtools/idxstats/tests/tags.yml new file mode 100644 index 00000000..d3057c61 --- /dev/null +++ b/modules/nf-core/samtools/idxstats/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/idxstats: + - modules/nf-core/samtools/idxstats/** diff --git a/modules/nf-core/samtools/index/environment.yml b/modules/nf-core/samtools/index/environment.yml new file mode 100644 index 00000000..296ed99e --- /dev/null +++ b/modules/nf-core/samtools/index/environment.yml @@ -0,0 +1,7 @@ +name: samtools_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.18 diff --git a/modules/nf-core/samtools/index/main.nf b/modules/nf-core/samtools/index/main.nf index 0b20aa4b..8ad18fdc 100644 --- 
a/modules/nf-core/samtools/index/main.nf +++ b/modules/nf-core/samtools/index/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_INDEX { tag "$meta.id" label 'process_low' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" input: tuple val(meta), path(input) diff --git a/modules/nf-core/samtools/index/meta.yml b/modules/nf-core/samtools/index/meta.yml index 8bd2fa6f..01a4ee03 100644 --- a/modules/nf-core/samtools/index/meta.yml +++ b/modules/nf-core/samtools/index/meta.yml @@ -51,3 +51,7 @@ authors: - "@drpatelh" - "@ewels" - "@maxulysse" +maintainers: + - "@drpatelh" + - "@ewels" + - "@maxulysse" diff --git a/modules/nf-core/samtools/index/tests/csi.nextflow.config b/modules/nf-core/samtools/index/tests/csi.nextflow.config new file mode 100644 index 00000000..0ed260ef --- /dev/null +++ b/modules/nf-core/samtools/index/tests/csi.nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: SAMTOOLS_INDEX { + ext.args = '-c' + } + +} diff --git a/modules/nf-core/samtools/index/tests/main.nf.test b/modules/nf-core/samtools/index/tests/main.nf.test new file mode 100644 index 00000000..c76a9169 --- /dev/null +++ b/modules/nf-core/samtools/index/tests/main.nf.test @@ -0,0 +1,87 @@ +nextflow_process { + + name "Test Process SAMTOOLS_INDEX" + script "../main.nf" + process "SAMTOOLS_INDEX" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/index" + + test("sarscov2 [BAI]") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.bai).match("bai") }, + { assert path(process.out.versions.get(0)).getText().contains("samtools") } + ) + } + } + + test("homo_sapiens [CRAI]") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_cram'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.crai).match("crai") }, + { assert path(process.out.versions.get(0)).getText().contains("samtools") } + ) + } + } + + test("homo_sapiens [CSI]") { + + config "./csi.nextflow.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert path(process.out.csi.get(0).get(1)).exists() }, + { assert path(process.out.versions.get(0)).getText().contains("samtools") } + ) + } + } +} diff --git a/modules/nf-core/samtools/index/tests/main.nf.test.snap b/modules/nf-core/samtools/index/tests/main.nf.test.snap new file mode 100644 index 00000000..b3baee7f --- /dev/null +++ b/modules/nf-core/samtools/index/tests/main.nf.test.snap @@ -0,0 +1,28 @@ +{ + "crai": { + "content": [ + [ + [ + { + "id": "test" + }, + 
"test.paired_end.recalibrated.sorted.cram.crai:md5,14bc3bd5c89cacc8f4541f9062429029" + ] + ] + ], + "timestamp": "2023-11-15T15:17:37.30801" + }, + "bai": { + "content": [ + [ + [ + { + "id": "test" + }, + "test.paired_end.sorted.bam.bai:md5,704c10dd1326482448ca3073fdebc2f4" + ] + ] + ], + "timestamp": "2023-11-15T15:17:30.869234" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/index/tests/tags.yml b/modules/nf-core/samtools/index/tests/tags.yml new file mode 100644 index 00000000..e0f58a7a --- /dev/null +++ b/modules/nf-core/samtools/index/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/index: + - modules/nf-core/samtools/index/** diff --git a/modules/nf-core/samtools/sort/environment.yml b/modules/nf-core/samtools/sort/environment.yml new file mode 100644 index 00000000..cd50868c --- /dev/null +++ b/modules/nf-core/samtools/sort/environment.yml @@ -0,0 +1,7 @@ +name: samtools_sort +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.18 diff --git a/modules/nf-core/samtools/sort/main.nf b/modules/nf-core/samtools/sort/main.nf index 1e5181d4..4a666d42 100644 --- a/modules/nf-core/samtools/sort/main.nf +++ b/modules/nf-core/samtools/sort/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_SORT { tag "$meta.id" label 'process_medium' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" input: tuple val(meta), path(bam) @@ -21,13 +21,11 @@ process SAMTOOLS_SORT { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def sort_memory = (task.memory.mega/task.cpus).intValue() if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
""" samtools sort \\ $args \\ -@ $task.cpus \\ - -m ${sort_memory}M \\ -o ${prefix}.bam \\ -T $prefix \\ $bam diff --git a/modules/nf-core/samtools/sort/meta.yml b/modules/nf-core/samtools/sort/meta.yml index 07328431..2200de72 100644 --- a/modules/nf-core/samtools/sort/meta.yml +++ b/modules/nf-core/samtools/sort/meta.yml @@ -46,3 +46,6 @@ output: authors: - "@drpatelh" - "@ewels" +maintainers: + - "@drpatelh" + - "@ewels" diff --git a/modules/nf-core/samtools/sort/tests/main.nf.test b/modules/nf-core/samtools/sort/tests/main.nf.test new file mode 100644 index 00000000..abb80978 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/main.nf.test @@ -0,0 +1,73 @@ +nextflow_process { + + name "Test Process SAMTOOLS_SORT" + script "../main.nf" + process "SAMTOOLS_SORT" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/sort" + + test("test_samtools_sort") { + + config "./nextflow.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + [ + file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("test_samtools_sort_stub") { + + config "./nextflow.config" + options "-stub-run" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + [ + file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.versions + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/samtools/sort/tests/main.nf.test.snap b/modules/nf-core/samtools/sort/tests/main.nf.test.snap new file mode 100644 index 00000000..ff722259 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/main.nf.test.snap @@ -0,0 +1,48 @@ +{ + "test_samtools_sort": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam:md5,ea6a0fef94eb534e901f107a05a33a06" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,33b6a403dc19a0d28e4219ccab0a1d80" + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam:md5,ea6a0fef94eb534e901f107a05a33a06" + ] + ], + "csi": [ + + ], + "versions": [ + "versions.yml:md5,33b6a403dc19a0d28e4219ccab0a1d80" + ] + } + ], + "timestamp": "2023-12-04T11:11:22.005628301" + }, + "test_samtools_sort_stub": { + "content": [ + "test.sorted.bam", + [ + "versions.yml:md5,33b6a403dc19a0d28e4219ccab0a1d80" + ] + ], + "timestamp": "2023-12-04T17:47:22.314445935" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/sort/tests/nextflow.config b/modules/nf-core/samtools/sort/tests/nextflow.config new file mode 100644 index 00000000..d0f35086 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: SAMTOOLS_SORT { + ext.prefix = { "${meta.id}.sorted" } + } + +} diff --git a/modules/nf-core/samtools/sort/tests/tags.yml b/modules/nf-core/samtools/sort/tests/tags.yml new file mode 100644 index 00000000..cd63ea20 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/tags.yml @@ -0,0 +1,3 @@ +samtools/sort: + - modules/nf-core/samtools/sort/** + - tests/modules/nf-core/samtools/sort/** diff --git a/modules/nf-core/samtools/stats/environment.yml 
b/modules/nf-core/samtools/stats/environment.yml new file mode 100644 index 00000000..b89ce647 --- /dev/null +++ b/modules/nf-core/samtools/stats/environment.yml @@ -0,0 +1,7 @@ +name: samtools_stats +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.18 diff --git a/modules/nf-core/samtools/stats/main.nf b/modules/nf-core/samtools/stats/main.nf index 4a2607de..7539140a 100644 --- a/modules/nf-core/samtools/stats/main.nf +++ b/modules/nf-core/samtools/stats/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_STATS { tag "$meta.id" label 'process_single' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" input: tuple val(meta), path(input), path(input_index) diff --git a/modules/nf-core/samtools/stats/meta.yml b/modules/nf-core/samtools/stats/meta.yml index 90e6345f..735ff812 100644 --- a/modules/nf-core/samtools/stats/meta.yml +++ b/modules/nf-core/samtools/stats/meta.yml @@ -57,3 +57,7 @@ authors: - "@drpatelh" - "@FriederikeHanssen" - "@ramprasadn" +maintainers: + - "@drpatelh" + - "@FriederikeHanssen" + - "@ramprasadn" diff --git a/modules/nf-core/samtools/stats/tests/main.nf.test b/modules/nf-core/samtools/stats/tests/main.nf.test new file mode 100644 index 00000000..20c3efe1 --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/main.nf.test @@ -0,0 +1,78 @@ +nextflow_process { + + name "Test Process SAMTOOLS_STATS" + script "../main.nf" + process "SAMTOOLS_STATS" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/stats" + + test("SAMTOOLS STATS Should run without failures") { + + when { + params { + + outdir = "$outputDir" + } + process { + """ + // define inputs of the process here. 
+ input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true) + + ] + input[1] = [[],[]] + """ + + } + } + + then { + assertAll( + {assert process.success}, + {assert snapshot(process.out).match()} + ) + } + + } + + test("SAMTOOLS CRAM Should run without failures") { + + when { + params { + + outdir = "$outputDir" + } + process { + """ + // define inputs of the process here + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_cram'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_cram_crai'], checkIfExists: true) + + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + + + } + + then { + assertAll( + {assert process.success}, + {assert snapshot(process.out).match()} + ) + } + + } + + +} diff --git a/modules/nf-core/samtools/stats/tests/main.nf.test.snap b/modules/nf-core/samtools/stats/tests/main.nf.test.snap new file mode 100644 index 00000000..025c83a5 --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/main.nf.test.snap @@ -0,0 +1,64 @@ +{ + "SAMTOOLS STATS Should run without failures": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,045a48208b1c6f5b8af4347fe31f4def" + ] + ], + "1": [ + "versions.yml:md5,650a365c6635001436008350ae83337c" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,045a48208b1c6f5b8af4347fe31f4def" + ] + ], + "versions": [ + "versions.yml:md5,650a365c6635001436008350ae83337c" + ] + } + ], + "timestamp": "2023-12-04T11:07:28.26821485" + }, + "SAMTOOLS CRAM Should run without failures": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,dfbfa130d4a6925ddd1931dcd8354a43" + ] + ], + "1": [ + "versions.yml:md5,650a365c6635001436008350ae83337c" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,dfbfa130d4a6925ddd1931dcd8354a43" + ] + ], + "versions": [ + "versions.yml:md5,650a365c6635001436008350ae83337c" + ] + } + ], + "timestamp": "2023-12-04T11:07:50.356233402" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/stats/tests/tags.yml b/modules/nf-core/samtools/stats/tests/tags.yml new file mode 100644 index 00000000..7c28e30f --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/stats: + - modules/nf-core/samtools/stats/** diff --git a/modules/nf-core/samtools/view/environment.yml b/modules/nf-core/samtools/view/environment.yml new file mode 100644 index 00000000..99aa69d0 --- /dev/null +++ b/modules/nf-core/samtools/view/environment.yml @@ -0,0 +1,7 @@ +name: samtools_view +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.18 diff --git a/modules/nf-core/samtools/view/main.nf b/modules/nf-core/samtools/view/main.nf index cb91facf..0b5a2912 100644 --- a/modules/nf-core/samtools/view/main.nf +++ b/modules/nf-core/samtools/view/main.nf @@ -2,10 +2,10 @@ process SAMTOOLS_VIEW { tag "$meta.id" label 'process_low' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && 
!task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1' : + 'biocontainers/samtools:1.18--h50ea8bc_1' }" input: tuple val(meta), path(input), path(index) @@ -53,10 +53,19 @@ process SAMTOOLS_VIEW { """ stub: + def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" + def file_type = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt bam") ? "bam" : + args.contains("--output-fmt cram") ? "cram" : + input.getExtension() + if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + + def index = args.contains("--write-index") ? "touch ${prefix}.csi" : "" + """ - touch ${prefix}.bam - touch ${prefix}.cram + touch ${prefix}.${file_type} + ${index} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/samtools/view/meta.yml b/modules/nf-core/samtools/view/meta.yml index 3b05450b..3dadafae 100644 --- a/modules/nf-core/samtools/view/meta.yml +++ b/modules/nf-core/samtools/view/meta.yml @@ -82,3 +82,8 @@ authors: - "@joseespinosa" - "@FriederikeHanssen" - "@priyanka-surana" +maintainers: + - "@drpatelh" + - "@joseespinosa" + - "@FriederikeHanssen" + - "@priyanka-surana" diff --git a/modules/nf-core/samtools/view/tests/bam.config b/modules/nf-core/samtools/view/tests/bam.config new file mode 100644 index 00000000..c10d1081 --- /dev/null +++ b/modules/nf-core/samtools/view/tests/bam.config @@ -0,0 +1,3 @@ +process { + ext.args = "--output-fmt bam" +} \ No newline at end of file diff --git a/modules/nf-core/samtools/view/tests/bam_index.config b/modules/nf-core/samtools/view/tests/bam_index.config new file mode 100644 index 00000000..771ae033 --- /dev/null +++ b/modules/nf-core/samtools/view/tests/bam_index.config @@ -0,0 +1,3 @@ +process { + ext.args = "--output-fmt bam --write-index" +} \ No newline at end of file diff --git a/modules/nf-core/samtools/view/tests/main.nf.test b/modules/nf-core/samtools/view/tests/main.nf.test new file mode 100644 index 00000000..89ed3555 --- /dev/null +++ b/modules/nf-core/samtools/view/tests/main.nf.test @@ -0,0 +1,231 @@ +nextflow_process { + + name "Test Process SAMTOOLS_VIEW" + script "../main.nf" + process "SAMTOOLS_VIEW" + + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/view" + + test("sarscov2 - [bam, []], [], []") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: true), + [] + ] + input[1] = [[],[]] + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.sam, + process.out.bai, + process.out.crai, + process.out.csi, + process.out.versions + ).match() } + ) + } + + } + + test("homo_sapiens - [cram, crai], fasta, []") { + + when { + process { + """ + input[0] = [ + [ id: 'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram_crai'], checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = [] + """ + } + } + + then 
{ + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.cram[0][1]).name, + process.out.bam, + process.out.sam, + process.out.bai, + process.out.crai, + process.out.csi, + process.out.versions + ).match() } + ) + } + + } + + test("homo_sapiens - [cram, []], fasta, [] - bam output") { + + config "./bam.config" + + when { + process { + """ + input[0] = [ + [ id: 'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram'], checkIfExists: true), + [] + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.sam, + process.out.bai, + process.out.crai, + process.out.csi, + process.out.versions + ).match() } + ) + } + + } + + test("homo_sapiens - [cram, []], fasta, [] - bam & index output") { + + config "./bam_index.config" + + when { + process { + """ + input[0] = [ + [ id: 'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram'], checkIfExists: true), + [] + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.sam, + file(process.out.csi[0][1]).name, + process.out.crai, + process.out.bai, + process.out.versions + ).match() } + ) + } + + } + + test("homo_sapiens - [cram, []], fasta, qname - bam & index output") { + + config "./bam_index.config" + + when { + process { + """ + input[0] = [ + [ id: 'test' ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram'], checkIfExists: true), + [] + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = Channel.of("testN:2817", "testN:2814").collectFile(name: "readnames.list", newLine: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.sam, + file(process.out.csi[0][1]).name, + process.out.crai, + process.out.bai, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - [bam, []], [], [] - stub") { + + options "-stub" + config "./bam_index.config" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: true), + [] + ] + input[1] = [[],[]] + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + process.out.cram, + process.out.sam, + file(process.out.csi[0][1]).name, + process.out.crai, + process.out.bai, + process.out.versions + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/samtools/view/tests/main.nf.test.snap b/modules/nf-core/samtools/view/tests/main.nf.test.snap new file mode 100644 index 00000000..83427491 --- /dev/null +++ b/modules/nf-core/samtools/view/tests/main.nf.test.snap @@ -0,0 +1,140 @@ +{ + "homo_sapiens - [cram, []], fasta, [] - bam output": { + "content": [ + "test.bam", + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + 
"versions.yml:md5,06b9049228b111e7bed5c52fe8a98d9b" + ] + ], + "timestamp": "2023-12-04T17:41:17.563069206" + }, + "sarscov2 - [bam, []], [], []": { + "content": [ + "test.bam", + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + "versions.yml:md5,06b9049228b111e7bed5c52fe8a98d9b" + ] + ], + "timestamp": "2023-12-04T17:41:03.206994564" + }, + "homo_sapiens - [cram, []], fasta, qname - bam & index output": { + "content": [ + "test.bam", + [ + + ], + [ + + ], + "test.bam.csi", + [ + + ], + [ + + ], + [ + "versions.yml:md5,06b9049228b111e7bed5c52fe8a98d9b" + ] + ], + "timestamp": "2023-12-04T17:44:39.165289759" + }, + "homo_sapiens - [cram, []], fasta, [] - bam & index output": { + "content": [ + "test.bam", + [ + + ], + [ + + ], + "test.bam.csi", + [ + + ], + [ + + ], + [ + "versions.yml:md5,06b9049228b111e7bed5c52fe8a98d9b" + ] + ], + "timestamp": "2023-12-04T17:44:32.25731224" + }, + "sarscov2 - [bam, []], [], [] - stub": { + "content": [ + "test.bam", + [ + + ], + [ + + ], + "test.csi", + [ + + ], + [ + + ], + [ + "versions.yml:md5,06b9049228b111e7bed5c52fe8a98d9b" + ] + ], + "timestamp": "2023-12-04T17:44:45.81037195" + }, + "homo_sapiens - [cram, crai], fasta, []": { + "content": [ + "test.cram", + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + + ], + [ + "versions.yml:md5,06b9049228b111e7bed5c52fe8a98d9b" + ] + ], + "timestamp": "2023-12-04T17:41:10.730011823" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/view/tests/tags.yml b/modules/nf-core/samtools/view/tests/tags.yml new file mode 100644 index 00000000..4fdf1dd1 --- /dev/null +++ b/modules/nf-core/samtools/view/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/view: + - "modules/nf-core/samtools/view/**" diff --git a/modules/nf-core/trimgalore/environment.yml b/modules/nf-core/trimgalore/environment.yml new file mode 100644 index 00000000..6cd0f51b --- /dev/null +++ b/modules/nf-core/trimgalore/environment.yml @@ -0,0 +1,7 @@ +name: trimgalore +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::trim-galore=0.6.7 diff --git a/modules/nf-core/trimgalore/main.nf b/modules/nf-core/trimgalore/main.nf index dcb77ae7..24ead871 100644 --- a/modules/nf-core/trimgalore/main.nf +++ b/modules/nf-core/trimgalore/main.nf @@ -2,7 +2,7 @@ process TRIMGALORE { tag "$meta.id" label 'process_high' - conda "bioconda::trim-galore=0.6.7" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/trim-galore:0.6.7--hdfd78af_0' : 'biocontainers/trim-galore:0.6.7--hdfd78af_0' }" diff --git a/modules/nf-core/trimgalore/meta.yml b/modules/nf-core/trimgalore/meta.yml index f84c4d77..e649088c 100644 --- a/modules/nf-core/trimgalore/meta.yml +++ b/modules/nf-core/trimgalore/meta.yml @@ -62,3 +62,7 @@ authors: - "@drpatelh" - "@ewels" - "@FelixKrueger" +maintainers: + - "@drpatelh" + - "@ewels" + - "@FelixKrueger" diff --git a/modules/nf-core/trimgalore/tests/main.nf.test b/modules/nf-core/trimgalore/tests/main.nf.test new file mode 100644 index 00000000..bc6812cc --- /dev/null +++ b/modules/nf-core/trimgalore/tests/main.nf.test @@ -0,0 +1,105 @@ +nextflow_process { + + name "Test Process TRIMGALORE" + script "../main.nf" + process "TRIMGALORE" + tag "modules" + tag "modules_nfcore" + tag "trimgalore" + + test("test_trimgalore_single_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + def read_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { report1_lines.each { report1_line -> + { assert path(process.out.log.get(0).get(1)).getText().contains(report1_line) } + } + } + ) + } + } + + test("test_trimgalore_paired_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + def read1_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { report1_lines.each { report1_line -> + { assert path(process.out.log.get(0).get(1).get(0)).getText().contains(report1_line) } + } + }, + { report2_lines.each { report2_line -> + { assert path(process.out.log.get(0).get(1).get(1)).getText().contains(report2_line) } + } + } + ) + } + } +} diff --git a/modules/nf-core/trimgalore/tests/main.nf.test.snap b/modules/nf-core/trimgalore/tests/main.nf.test.snap new file mode 100644 index 00000000..84feacca --- /dev/null +++ b/modules/nf-core/trimgalore/tests/main.nf.test.snap @@ -0,0 +1,148 @@ +{ + "test_trimgalore_single_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test_trimmed.fq.gz:md5,e0a7516b8ea8d6467d6306acb2cd13c4" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + 
"test.fastq.gz_trimming_report.txt:md5,a1ab3958205f1ddf48af623242b5b429" + ] + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + "versions.yml:md5,47d966cbb31c80eb8f7fe860d55659b7" + ], + "html": [ + + ], + "log": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastq.gz_trimming_report.txt:md5,a1ab3958205f1ddf48af623242b5b429" + ] + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test_trimmed.fq.gz:md5,e0a7516b8ea8d6467d6306acb2cd13c4" + ] + ], + "unpaired": [ + + ], + "versions": [ + "versions.yml:md5,47d966cbb31c80eb8f7fe860d55659b7" + ], + "zip": [ + + ] + } + ], + "timestamp": "2023-10-17T15:24:57.782141441" + }, + "test_trimgalore_paired_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1_val_1.fq.gz:md5,e0a7516b8ea8d6467d6306acb2cd13c4", + "test_2_val_2.fq.gz:md5,f3d61189e6d10202da7b8686f1dbb71b" + ] + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastq.gz_trimming_report.txt:md5,315d40465412f9909bbaabf52269274d", + "test_2.fastq.gz_trimming_report.txt:md5,34436303da1c78811103427a2fb57f7b" + ] + ] + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + "versions.yml:md5,47d966cbb31c80eb8f7fe860d55659b7" + ], + "html": [ + + ], + "log": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastq.gz_trimming_report.txt:md5,315d40465412f9909bbaabf52269274d", + "test_2.fastq.gz_trimming_report.txt:md5,34436303da1c78811103427a2fb57f7b" + ] + ] + ], + "reads": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1_val_1.fq.gz:md5,e0a7516b8ea8d6467d6306acb2cd13c4", + "test_2_val_2.fq.gz:md5,f3d61189e6d10202da7b8686f1dbb71b" + ] + ] + ], + "unpaired": [ + + ], + "versions": [ + "versions.yml:md5,47d966cbb31c80eb8f7fe860d55659b7" + ], + "zip": [ + + ] + } + ], + "timestamp": "2023-10-17T15:25:08.513589909" + } +} \ No newline at end of file diff --git a/modules/nf-core/trimgalore/tests/tags.yml b/modules/nf-core/trimgalore/tests/tags.yml new file mode 100644 index 00000000..e9937691 --- /dev/null +++ b/modules/nf-core/trimgalore/tests/tags.yml @@ -0,0 +1,2 @@ +trimgalore: + - modules/nf-core/trimgalore/** diff --git a/subworkflows/nf-core/bam_markduplicates_picard/meta.yml b/subworkflows/nf-core/bam_markduplicates_picard/meta.yml index d5e71609..fe63068e 100644 --- a/subworkflows/nf-core/bam_markduplicates_picard/meta.yml +++ b/subworkflows/nf-core/bam_markduplicates_picard/meta.yml @@ -6,14 +6,13 @@ keywords: - bam - sam - cram - -modules: +components: - picard/markduplicates - samtools/index - samtools/stats - samtools/idxstats - samtools/flagstat - + - bam_stats_samtools input: - ch_bam: description: | @@ -59,3 +58,6 @@ output: authors: - "@dmarron" - "@drpatelh" +maintainers: + - "@dmarron" + - "@drpatelh" diff --git a/subworkflows/nf-core/bam_markduplicates_picard/tests/main.nf.test b/subworkflows/nf-core/bam_markduplicates_picard/tests/main.nf.test new file mode 100644 index 00000000..e721f30c --- /dev/null +++ b/subworkflows/nf-core/bam_markduplicates_picard/tests/main.nf.test @@ -0,0 +1,92 @@ +nextflow_workflow { + + name "Test Workflow BAM_MARKDUPLICATES_PICARD" + script "../main.nf" + workflow "BAM_MARKDUPLICATES_PICARD" + + tag "picard" + tag "picard/markduplicates" + tag "subworkflows" + tag "subworkflows_nfcore" + tag "bam_markduplicates_picard" + tag "subworkflows/bam_markduplicates_picard" + tag "subworkflows/bam_stats_samtools" + tag "bam_stats_samtools" + tag "samtools" + tag "samtools/flagstat" + tag 
"samtools/idxstats" + tag "samtools/index" + tag "samtools/stats" + + test("homo_sapiens - bam") { + + when { + workflow { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = [ + [ id:'genome' ], + file(params.test_data['sarscov2']['genome']['genome_fasta_fai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot( + path(workflow.out.bam[0][1]), + path(workflow.out.bai[0][1]), + path(workflow.out.flagstat[0][1]), + path(workflow.out.idxstats[0][1]), + path(workflow.out.stats[0][1]), + ).match("homo_sapiens - bam") }, + { assert path(workflow.out.metrics.get(0).get(1)).getText().contains("97") } + ) + } + } + + test("homo_sapiens - cram") { + + when { + workflow { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram'], checkIfExists: true) + ] + input[1] = [ [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + input[2] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta_fai'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot( + path(workflow.out.bam[0][1]), + path(workflow.out.bai[0][1]), + path(workflow.out.flagstat[0][1]), + path(workflow.out.idxstats[0][1]), + path(workflow.out.stats[0][1]), + ).match("homo_sapiens - cram") }, + { assert path(workflow.out.metrics.get(0).get(1)).getText().contains("0.999986") } + ) + } + } + +} diff --git a/subworkflows/nf-core/bam_markduplicates_picard/tests/main.nf.test.snap b/subworkflows/nf-core/bam_markduplicates_picard/tests/main.nf.test.snap new file mode 100644 index 00000000..b1907385 --- /dev/null +++ b/subworkflows/nf-core/bam_markduplicates_picard/tests/main.nf.test.snap @@ -0,0 +1,22 @@ +{ + "homo_sapiens - cram": { + "content": [ + "test.bam:md5,6641dc05efa8384a061f378d86d922cd", + "test.bam.bai:md5,c41c60d8a94adebe53b6df80b6e90d38", + "test.flagstat:md5,93b0ef463df947ede1f42ff60396c34d", + "test.idxstats:md5,e179601fa7b8ebce81ac3765206f6c15", + "test.stats:md5,0035ac8900d85e9a790f4c1f48b76947" + ], + "timestamp": "2023-12-05T17:45:12.484869" + }, + "homo_sapiens - bam": { + "content": [ + "test.bam:md5,3091fe6ba1b7530f382fe40b9fd8f45b", + "test.bam.bai:md5,4d3ae8d013444b55e17aa0149a2ab404", + "test.flagstat:md5,4f7ffd1e6a5e85524d443209ac97d783", + "test.idxstats:md5,df60a8c8d6621100d05178c93fb053a2", + "test.stats:md5,e32e7e49dce1fbe327a89e0fb7bc01b1" + ], + "timestamp": "2023-12-05T17:43:58.582652" + } +} \ No newline at end of file diff --git a/subworkflows/nf-core/bam_markduplicates_picard/tests/tags.yml b/subworkflows/nf-core/bam_markduplicates_picard/tests/tags.yml new file mode 100644 index 00000000..10b85270 --- /dev/null +++ b/subworkflows/nf-core/bam_markduplicates_picard/tests/tags.yml @@ -0,0 +1,2 @@ +subworkflows/bam_markduplicates_picard: + - subworkflows/nf-core/bam_markduplicates_picard/** diff --git a/subworkflows/nf-core/bam_stats_samtools/meta.yml b/subworkflows/nf-core/bam_stats_samtools/meta.yml index b05086bc..809bf736 100644 --- a/subworkflows/nf-core/bam_stats_samtools/meta.yml +++ b/subworkflows/nf-core/bam_stats_samtools/meta.yml @@ -7,7 +7,7 @@ keywords: - 
bam - sam - cram -modules: +components: - samtools/stats - samtools/idxstats - samtools/flagstat @@ -39,3 +39,5 @@ output: Structure: [ path(versions.yml) ] authors: - "@drpatelh" +maintainers: + - "@drpatelh" diff --git a/subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test b/subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test new file mode 100644 index 00000000..97210890 --- /dev/null +++ b/subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test @@ -0,0 +1,102 @@ +nextflow_workflow { + + name "Test Workflow BAM_STATS_SAMTOOLS" + script "../main.nf" + workflow "BAM_STATS_SAMTOOLS" + tag "subworkflows" + tag "subworkflows_nfcore" + tag "bam_stats_samtools" + tag "subworkflows/bam_stats_samtools" + tag "samtools" + tag "samtools/flagstat" + tag "samtools/idxstats" + tag "samtools/stats" + + test("test_bam_stats_samtools_single_end") { + + when { + params { + outdir = "$outputDir" + } + workflow { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_single_end_sorted_bam_bai'], checkIfExists: true) + ] + input[1] = [ [ id:'genome' ], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot(workflow.out.stats).match("test_bam_stats_samtools_single_end_stats") }, + { assert snapshot(workflow.out.flagstat).match("test_bam_stats_samtools_single_end_flagstats") }, + { assert snapshot(workflow.out.idxstats).match("test_bam_stats_samtools_single_end_idxstats") } + ) + } + } + + test("test_bam_stats_samtools_paired_end") { + + when { + params { + outdir = "$outputDir" + } + workflow { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true) + ] + input[1] = [ [ id:'genome' ], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out.stats).match("test_bam_stats_samtools_paired_end_stats") }, + { assert snapshot(workflow.out.flagstat).match("test_bam_stats_samtools_paired_end_flagstats") }, + { assert snapshot(workflow.out.idxstats).match("test_bam_stats_samtools_paired_end_idxstats") } + ) + } + } + + test("test_bam_stats_samtools_paired_end_cram") { + + when { + params { + outdir = "$outputDir" + } + workflow { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_sorted_cram_crai'], checkIfExists: true) + ] + input[1] = [ [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot(workflow.out.stats).match("test_bam_stats_samtools_paired_end_cram_stats") }, + { assert snapshot(workflow.out.flagstat).match("test_bam_stats_samtools_paired_end_cram_flagstats") }, + { assert snapshot(workflow.out.idxstats).match("test_bam_stats_samtools_paired_end_cram_idxstats") } + ) + } + } + +} diff --git 
a/subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test.snap b/subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test.snap new file mode 100644 index 00000000..d3af1376 --- /dev/null +++ b/subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test.snap @@ -0,0 +1,128 @@ +{ + "test_bam_stats_samtools_paired_end_cram_flagstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.flagstat:md5,a53f3d26e2e9851f7d528442bbfe9781" + ] + ] + ], + "timestamp": "2023-11-06T09:31:26.194017574" + }, + "test_bam_stats_samtools_paired_end_stats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.stats:md5,49e2b43344ff92bc4c02463a58f7ba4a" + ] + ] + ], + "timestamp": "2023-12-04T11:07:13.965061942" + }, + "test_bam_stats_samtools_paired_end_flagstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.flagstat:md5,4f7ffd1e6a5e85524d443209ac97d783" + ] + ] + ], + "timestamp": "2023-11-06T09:31:11.668517251" + }, + "test_bam_stats_samtools_single_end_flagstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.flagstat:md5,2191911d72575a2358b08b1df64ccb53" + ] + ] + ], + "timestamp": "2023-11-06T09:26:10.340046381" + }, + "test_bam_stats_samtools_paired_end_cram_idxstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.idxstats:md5,e179601fa7b8ebce81ac3765206f6c15" + ] + ] + ], + "timestamp": "2023-11-06T09:31:26.207052003" + }, + "test_bam_stats_samtools_single_end_stats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.stats:md5,5a6667d97806e5002731e9cf23674fad" + ] + ] + ], + "timestamp": "2023-12-04T11:07:06.676820877" + }, + "test_bam_stats_samtools_paired_end_idxstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.idxstats:md5,df60a8c8d6621100d05178c93fb053a2" + ] + ] + ], + "timestamp": "2023-11-06T09:31:11.68246157" + }, + "test_bam_stats_samtools_single_end_idxstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.idxstats:md5,613e048487662c694aa4a2f73ca96a20" + ] + ] + ], + "timestamp": "2023-11-06T09:26:10.349439801" + }, + "test_bam_stats_samtools_paired_end_cram_stats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,2cf2fe93596ee3d74f946097b204a629" + ] + ] + ], + "timestamp": "2023-12-04T11:07:22.30295557" + } +} \ No newline at end of file diff --git a/subworkflows/nf-core/bam_stats_samtools/tests/tags.yml b/subworkflows/nf-core/bam_stats_samtools/tests/tags.yml new file mode 100644 index 00000000..ec2f2d68 --- /dev/null +++ b/subworkflows/nf-core/bam_stats_samtools/tests/tags.yml @@ -0,0 +1,2 @@ +subworkflows/bam_stats_samtools: + - subworkflows/nf-core/bam_stats_samtools/** From 79d9f03d1ca7e1915889368480b48a35d5f6625b Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Sun, 7 Jan 2024 17:35:13 +0000 Subject: [PATCH 25/48] lib update --- lib/WorkflowCircdna.groovy | 47 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/lib/WorkflowCircdna.groovy b/lib/WorkflowCircdna.groovy index cc1128ef..06fd8bbf 100755 --- a/lib/WorkflowCircdna.groovy +++ b/lib/WorkflowCircdna.groovy @@ -11,6 +11,7 @@ class WorkflowCircdna { // Check and validate parameters // public static void initialise(params, log) { + genomeExistsError(params, log) @@ -44,17 +45,59 @@ class WorkflowCircdna { yaml_file_text += "data: |\n" yaml_file_text += "${summary_section}" return 
yaml_file_text - }// + } + + // + // Generate methods description for MultiQC + // + + public static String toolCitationText(params) { + + // TODO nf-core: Optionally add in-text citation tools to this list. + // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "", + // Uncomment function in methodsDescriptionText to render in MultiQC report + def citation_text = [ + "Tools used in the workflow included:", + "FastQC (Andrews 2010),", + "MultiQC (Ewels et al. 2016)", + "." + ].join(' ').trim() + + return citation_text + } + + public static String toolBibliographyText(params) { + + // TODO Optionally add bibliographic entries to this list. + // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
<li>Author (2023) Pub name, Journal, DOI</li>" : "", + // Uncomment function in methodsDescriptionText to render in MultiQC report + def reference_text = [ + "<li>Andrews S, (2010) FastQC, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).</li>", + "<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354</li>" + ].join(' ').trim() + + return reference_text + } - public static String methodsDescriptionText(run_workflow, mqc_methods_yaml) { + public static String methodsDescriptionText(run_workflow, mqc_methods_yaml, params) { // Convert to a named map so can be used as with familar NXF ${workflow} variable syntax in the MultiQC YML file def meta = [:] meta.workflow = run_workflow.toMap() meta["manifest_map"] = run_workflow.manifest.toMap() + // Pipeline DOI meta["doi_text"] = meta.manifest_map.doi ? "(doi: ${meta.manifest_map.doi})" : "" meta["nodoi_text"] = meta.manifest_map.doi ? "": "<li>If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used. </li>" + // Tool references + meta["tool_citations"] = "" + meta["tool_bibliography"] = "" + + // TODO Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled! + //meta["tool_citations"] = toolCitationText(params).replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") + //meta["tool_bibliography"] = toolBibliographyText(params) + + def methods_text = mqc_methods_yaml.text def engine = new SimpleTemplateEngine()
From 1657a11b2db9c635a0c9ff4877857442acf3c1e1 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Sun, 7 Jan 2024 17:36:44 +0000 Subject: [PATCH 26/48] updated gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index dbc46ce6..fcb4940e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ testing/ testing* *.pyc null +.vscode
From 06f0eb5b6359dbc1340df224ba60266e7f609ffb Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Sun, 7 Jan 2024 17:39:47 +0000 Subject: [PATCH 27/48] fix template change --- assets/nf-core-circdna_logo_light.png | Bin 10330 -> 67721 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/assets/nf-core-circdna_logo_light.png b/assets/nf-core-circdna_logo_light.png index 7146e2817928f2cacd3520fc9c69a675f9c88fa1..1c5283cd9330da6a5e86189e2e24718b5b708287 100644 GIT binary patch literal 67721
z&qg`>1%9l^TZA1Ee2OBqU7ZSj!5J_01=AJy>agDL+(OK9-}Qd zDy*aLP4MgZ-Rz3YweCfbCSeql3lES(5cYCWckWFWzhGVoqYwS~BK~bQqs!eW5CM8(&Zj zxg=~lFlwE+$wJi8MzmJb=NYb@P4jInnsIGy<4OJ2*xusTj*}|em|{l)$zXzM%O3BA zZ%w^~0q(8Hy0g1X8!kBKPwI(0zIdSh5T#3Y@pGOYS$ed!9@)kB6}eKyI2NO?NGUo7 z!WtM#kV?j@{c8b-;aIZc?g>7~@PhOlPO5q783-N(xeNAs!OdcE;tu}e=tLDg-UBk{ zI5@Qg(P}d12!m$+8oiyKcmk=tJ2>)v_lPLHwby+gCc03JQ;WM-dF*e*x0zrQ6S{Ze zo9p8-bi!*mfVdfN_=c3IAG%+IwC|3idF|u)M%Tux{a75CME{NOZTx&`<7+!`Ea>j2!4}ZP zlt%a*35=!pk0h@>r?=2<*^r{@8OsMv=?PcwSEyA1gy`*fIf>DBB*V{-iX9 zPg!-H-RnV30eQQ97F^viW#E}A)xyx0F7ELxiybA;iq$`UXD+sF>kZW6FYOnG_ zfWim=M^6?Xp_ca8Q)x`&+m&l?e|VP7b~P}*5QtMhss3|lhRPsV_uX5-mG&q<_ak5V zOzV=Jy~O0GH@#s77@x`2m9A1i`S4gY<;dM;Vd4vrsa{DsCC;RF7nXUl+qpUTkb)*7 zKTdq-Qt(#6!uV-!jLr{d62?4(m8O|+E4B#p3qudh6;#Z6G*`>rz2C<+jyK<5^b@NY ztzr1ZzUcyx?Bly>%HWB*Z806YB~q2&HZ9t2Nf#ipwV~trE!Uyw>ZmUa>$BUWI#Mz- z`h^t*u}-8Y!iY(CZ;uPk|ZX(5ZB^t`IQfO-e)uXQ+0C|ztXd8hYu=Z z{bXBWYX|#Z#$E`Z;`a)tSqM!Z-aMoUdxLu!fZuQv}SUI!Pyc%^@K!ES@c~@-~fT&+GK3MR#{`ZMxJe za0)Iq6gxFz+gB9M+au=-MMfLA-)y+lTTM5xv+Pb_+pW8tIja1(7X8F?Rl8CBk8}?v z!^+z$$zE`o+3LuM$v;aoY}R)7l8(fK*Wql_sLA9+;mP zGgs;m|9DZLqWXh9Xtpx(;Z$xE24y~}WmeH%6-5{16sZ|x>M2Igwl?%lrZz0k;69Gd zgr1_kl+wuPHh!e^(oILs{h?AvpGME6Crkyyk z?O7B0&V4b;FxRE3a_M(lhFBP#@RtB1MVA-1#r=$okm)#NX=8I^iBR(n&uj zIhw_cxr9?@#db`v?h#shxK8?lC#~9*Lj1@%p+D1rN2Pji-+#hAhivOqtI4_k(@+QK zRw>iV#zU7}Sab~WQZc2f?G`>IfGiupBzSlBK0cvwDyu|3gKUfGE#k^Amr4!)5#VuR}%HzxIn)&=tSj*{!GC77J9w%G1?x9}J`2UhRs3 z0{zJ|?BbM9JAMP|rF(vMJ$|ezguidRfa>$S3D$1aG^$fYHGOp;%#*G8PT9Gj>5!fJ zD3`@8ok*3LOO{dQ$jNxzOTp36l>D{iClB{p{G0CApGahSTFE~#j$sfU>^Br{uZ$_qsv*vtZZJxC+_{ zsS34kSPtmFKEyNJ6b5k)N#^CL4*_QO(lcl>HwNLUjTR2!qXh{%THEjLc z^?^I+M5_8}#rZEoeLL}Q$xL#Kx=_m`F2mu+u%@sds72m;mknKDg>nk@o6LpH39nUHP!sCv1Tu_@k z%dD)njLcUtIgNdvve}Tt~%S~&z2ldUoj2ACMql5qgn#V{O zKXdZ_lYJ4mzhZhrxX-;zy+3AGw4s@o{8bshtC*ESA$&x5zyG5vDsbj_?$-Ldd}hN3 zCO!oj+nl~*uX4jTfoMvOBRT^1Ahen@@2a=C>SU1fD0{KF*%YyLul(?Dxq!AYikI5A zQ!2rLJC>W)p0BouFKcF<#`0_PeBn@d0&gDwVjA08xW9<><3lzvE4PWqDg|_<{TkZ2+u8gD!dVu7akbNQ+2itVA%5pH;ocR5OtTz5bYBo# zRuEoLTbZS?ch?$Wr=Xn6Ubka3tJLqyp|dX)p8BHfd`16My1}L`WDgPJ-}tEpkp`e~ z2hdTtq~OQ_m9*A!&#H;@@RA_YaC+Bxp4<5K;m3$4;7?zv(pS0^m#<=D_&JxLl1JmE z5YapS=RFUH@u(D!M0ZaQ(dV=UPAu=M zS+a5Wmt}}dl>RAwC+X>iR54RfNn7YbjZb1KFK?V^rwxcV5%UCm;qi|lcQHV5`eIIdyWcuEX|NxMzk5b@IgYakiJr5bGBPu%dt zm6r}GPa1#|BDe&k*mvZosws42DrK! 
zM*BJzH!Z3klBOQL+SFK8C3jo%LECDTyT8hw$LhvNSfo(|>n;r$yMp9cuiNAwWY{aP zg1zOJtJtOS@zcUfn|y-#W@c`~T8Dl=hf!06=s+#a2VA-jahL30C)zbq$1D+p98~8$ zOFIQ=q9g{0|L!=v{0NRqqjWE@@d-uOsa=#%Q?(zB#`bLByKESn@fVVxhAPQ-{R^9N zTkpF`spJBg`E~qFg>GelrqYop4+ZI{O{d%^5mB}C-x>X9MNp_W=6Tb0uj7BVv+mKP zT(PNV5UgO>Gm_~^!*QH@yo;v zYfIyaWv?o8cuUW5a(H+d=bq))%*NqlEF!f2u)&#Zs`L_?Jc9#C_^RU7ZIz=H#}e)9 zAh|`6Q7NE$QQPdI1$5R4K0b|0A|Le0I$nMg+Xc^}Ym!noE!UMhVD)lV>sbq3C2t?0 z7F+i1F0mPUJbJKct}?VL9EfON&Yrm0YZe$X`qa%|#XN?Jp)wbTTO)5!n6Cxw^kjd# z95jO&3!cPYv?och%QqXD&!(Dxu(`S>V7zp(#xVQ?&e+VsUy)gRlMn<*oopnn=N-^H zdXV3JceP;snrVB1a)Qt?sUY{E#Z%YMN?YZ4zryE(T@xB|abb|$d>5LY#izmucSwlf zmf=C{!Z;?5PlfkSD%)O}>1Vz0`SX1J-h;8baggmI1D zq`*{VlbB})JHOqW#`Xs?;6T^Dv7UZ;qs|Vm1J8;b6t;l}<#eAQ3mJw2@&w!}xu^-l zfdnHa|6NR=o@K^&+ezhM`U7NO?A>N3_U+H}lPOISlUs33QkYdTe?D~v7LHWv z@=%qjy%giJ+V^Vx=2GBfuvQ&9)(n|*Er;oY;h_}~YNQ!xj_UhH_+h%!$WElU90_nx zp6?^|HgWnjHyd0$<7XMaUGvLfkdeM}`;Jre_ z@RwC~HT%CYEP|^IEq(U1eP3F%FsAWXx;Oi6G*=s2#Okfg;v2M8krrMe1z{fk!2NIX zrGLM=m!-UQ-kT8$vd6(h_+npscuAb;-6tp?Z|*P9Z3z!m=GZ&T^5F@O2i&LiZ6v@C z?LqHk+|M)0!#|On;lp%k<*oYbaoI)9S)!^9O0DKzqV?Jl6>1}N3F_0sr=3?{r%OUU9P-p z(lgc*X?xv^CS5WB@I`Z)+Acqlb?N?LG;>?ls>7bWzMOBC=$Lo_)#a)~{xAR^(5SU^UdBP%kEhDthlQ&|rJ$UP)WyN|L zhBc?|7@4Nz%?^c^jyVZaEI1v#Y12T6P*LT1=uL{fU#7LJ_fJ)|bKx)w(P8b5AUOc`~cnUA*?OAp5iI=;!P&v|g~g3Vf(dNKn@=jdpn%yZ@47a9djS?dEsJp~c;$T?w~}V8bCa=8ww>T@D-g zm;8zoo`&^b#)qU-a%cSSnD?Gu2%Q1!Xijrhng6O7CjSk|c`sbX-JO-oTHjZZ_4Iif zq%qv+sJ8EMo84ED^OXwMaA#_kSq>doD2w~7X&dYeLn9RL*DHMHKr46D?YT|hFo{9GSbOCU$c_3fl#;h6Wu{k)LaQ(;qusA>QMOvLn zKhdRc*#?wz;l?6cV)nviBFOV@`@FRV-K!pX>bO-!suumoC;q|9pdrM+U3N|-r#1Mv zxjN9Wn2r02k3v+&!nl~=a!sinq502tOKDHuMsgZSNyWWv5dl5Hi z6{pspRvk(Hqv|!ub*F>fCkNUY3+h+g%*;2m#PZn;#|4&~#U}H(p-g8mHbzbVu*K%} zCDm8N*$lvppuzf~2y{Ma#2F3>Kei z<}Yg!u9u4MG+}VpB5f|HS{RS0NsT7zMv-a8-=8REJwqGzmQSIcvG%rf`oXhyZlx19 zQ_s+Ld9bnUO^jN4KENvf8qj_U3oXG%;-k{9_lHljgQ06jD`=;rHdBt5En``I0q!)P zbxHgGJx2+klL=IKN~mxduQxF1Dbrky6GeSqw2Z_* z_aM~>A3V7cz1$mIJ~%pQ$ye9F$n9~op`Lc`+a_F=y4|>vIaqNDq@=tGTF<%lLKzd@ z`}oo#@oW3vk1aMzk`+{C!+4p@`&mj9{QeJ}BY0t{CK8q)5Pg^~p1<{hj3G`<852Pl zep*mk{YT&~d$Z7vBfHY1e=vXJh%j$fcTza-=3lH+so$$y*wUPvzqz=8>?cFs z<*U2QLFbF3a;}KIEcqJi;daXABYrZU^q=QS{KE&R`C&eN$q$>F?7_9?GMT7k z-V>?Cb>OX6EbTV=sGJ}?qSs>5unV(Ry-z-Xb?#%o^J-_wDPcW-Prp3iCE1#EE~ll+ zH5_}C<50trknp<#wUCyr56<)Tz>PdJw#OsZqEh!wP}I34Q2UwK&Nv4(6>fxSz3Sn;E80Tt;Hm>z|-y9W`7JoXh5Si9Q<>3-Fj0SGl-0GQq6&CLhNvxW- z=ih95pjG-+B@Ry=s38Spyie05ONXv@FOiwf^vu^QE62I*B|f(iXlhT-yj0zfmoj

    )bNtXB<>| z?zw$VG?;}cA_WMLuWxkpU`bqq^-gI`l!vzyJIgmqm5DEFjm;@^zl*oW_s|8wm8e*b zz0XFbT9w}8+|d^`xK_6-vkAYgt=Keh)4pg{f8qatTnp1$c}kL8Q8Mn_uNQo(tIlKi zpX6ZQc^`-|an(4vp*vd)^SNh=Ro#iKRpvBh@*kGgjw6S?q%KHqoeH6(_1wIA`lV^z zAiRs`A3r0$<3C?@`aE7#*py0h!ZV&RT$9)V_a4o83@+F_%Eo_IXpu`p#0RmnkYKV6>PRTk%i$*vH0e2KA$-EIE^&JXaojXAE*53ZKr9x)`Qum z7UB9BUT@5(waVq@friz=*QwcTSIWnOG4BIs|6G-zA;m{oOAc}4!>le3X(;(rUNgef z(7*5!tt5aZn8P0!173!kFHC$!crh8;jTxMQSIE;}csC5F6Vx;H$&(nH3E%(&HAh^MAf}e0nfSMQPOniL_ z7j57+Bi!(wmiNfn2t9a|2C1x>?Ls7;Mf~#%uyxQ4XbR0iiZG~93)7HJPQ|COV0;>D z#;*;}%i>vM=bScHgBHF=!NCGns4A2;tr8_sKh_4a@ zt{B5ZWXgYDXOdJtuC%DBe?Lald9&;{9%iclNek+#CCvfe_-`5NJW@!FZA`&&O&=p9 zUwlVLYHm&ldOFGYwv^64tn!6!H32EqrT>2?b9bz=kKq{R5PdaZBW0#`LK1sQ18{uJjq4Q*}wb*uTa%(>{4%;VK01*KSq zh^qcE(^@tu>pk>REghc5E4ZPCWk%EaO%C z&%%0tbPv5YmqdT&R)}mL3i4XV6jvmR@TXK!7qX{ZJj;Gln!(~06Vc5%7Z>XGw*|CW z{3(&T7JDu_+<_&!Qbi0h)Zwm?Xj;_}Cbifn__LJbIWH-7#rR}P@spEbTfxO^XYW%M zhJEnJEAHE}H`p5>4E?|@|MY1)YOBU;fR@a2X-nTo)!{n3Xe8yyJAvAW=7UAr+^*hFU0;)||N9fTIy zB@~>=9fZueR+b%uo2$%=%7YAE@|9h4K3Gnr3xsLX&S#8Hmt95P4}F2SFI?k!cZE44 z^2&Ay?B%9a<(R{>NER!X`!cultn!S|gQPK!EeGM-a%y_zD!WSZ*gKbs4pw(8pY<-^ zZBJZw0{4iaQ9^ zT8kD}ql$!cJZi)g!$|5ll7vYeP!8VLd+Mk=2qkg8GX(MjA-$f&*W^R5TcrikeH_3g z2RzjTDrfB$SYPI)M3L--)_uH^7i!obxP{DPi zM5t48>!<|&hzBc#kyj=3dbup07F$XBsm!&;-|?ih7;FeG61KWhHgd-0#CxaI2<~64 zohOXU9U8pb+TZb2+zY+0l&eo_^T46u{q~Ue|CxIAMORWHakreaG}#%Q%Wu`*Og7GV zU(<`Cn@pWKnelXBd)xB7O*ED&nM^4DsVG+&`L>C}E7;)|eoNuO5us;xlLaK?UPnWL z9oIsOax`n6NWdBgeD0uZkVvFNYZ%?+(*c2XdpL?3?WayfRx`iGtCGnq$3sx;Vx(au zeMO66%Z|@fLcKSiZ}rdp!ka9fSR9_AmJ&!TPG)LeAcVXh*qv(ZH>Fx_p?Z7S7nWz) z)ey*k3!|#s(e?>@K9M-NqOo)0su5>}F+r^NmaMFtnvw_?(x_3SS5a+IXoVT<|7f5n z-$buLmMlGF3C@o%cq8VqPK?AJsprrN^WyKE4no3s8pPF}Mx72q;$0I|xYfakYG_Gc z357U>Rwm+~cQ?0o5ZVLAvyHORs^qFRX=&JXjNyp<-C>)ib3q~29*v;gHnL2YMhrPvbt=vSuYW4(cr@f z8=UnNlqNf&edfv)#HSxS=HRS5$s<37`H)w=WnJZkdw)=f6Q~4HzGpHu=cCi6ALdP1 zOCr9WAv56gk*@9&ED&R5pq8^O508?s7~M)Fejy@&lnCqs11Ju?5*TNoMVw8rVifFj zD0Up1el31t94lNCfFJZE_M$Bg$??f}Y%#sOy>j30VgauF7cy3Jc`~NLc@mm zb8?LBF*sBh>XCT{wRV0tuIBgEOClz^!hqnpS-}56WzSQ*Z%VqH3wb{?>5ydo4tnPU zxyUu-egF3R#hbM+cj|mFzLvWi^Qho&TOYdh=><&`I1208d#|_`Ht* zfRdAjL*2={gxY5jye5M9Fzx%{!{{ykj`IBreyhrM>4S#a(B$UT4niMF_`CmYdt<}! 
zv8TF&?0Y&h^K-)qPt6Bqvdv`30^U!{lAW*_lN~5#lp;HEsikw`{me=8=mP$JDi?Wt zpa#P;VlYn}B(4JBW&+~lL7B{A@a#9uw?wkCvgxV=oB4M7kt}3Vvit@|LV5W!K?I|L z;3>H|#C-&2vSf0SPNeU_A;)l4Y=bTzbFMEopMuqayJ>Lz%MeuS)id4_(^6#Vsx^#o zqJb}O-d?j;t$TRbuU`6g@^K<|lER|I)?xgC5t-FXN4tI4sFc_8?ck z_s6pNjh^u1IPD}Zwz6z0QHJgOnmH*Tb6H$7o)*DF6c6r@K!6SodT)WI{mhGGYJ}Iv z!G7g_coQcvliHBmNaKOzCs7eL*ZUIhBH6^Vh1?Ut9Hgq~`^Uy{HQT9hx&FUXSiT-x%ApC;r_aezH z5*`hvJZYm4$ztvx)wS-`9#1_?{hdO*b6x)e;_Sl70nEZD-K&s5e7azHJS6&nIr0Jy z?hX=4@T`nG|L}!jp#>f|MKlg4`HoU`vDo%oI}t>JFDa7b*?2-Xjg7j)tL_sR)!fA4 z23JD&1o4a40%LCb>_Aj+KL-dDo6-q&IyRM3Vtl zU6Y4%0zY5B3a3h_CFR^*rw14cAhz554#zc6UOiEcHj1tR-a)J!uynF>Gtjm(L5vac zkXVJ}Py~5D=3bgQMWH~wV;yehqYQ&q*5boqKlP*5;s z`X$CJ`Am|30f|^+vYK=ms{$_?=mVJC$3(L1Ny~P_IR~dzTaL2&%qKA?v&>rSREbn1 zkzOFc&M>~dF3>-o5p){uFYMDUgU?T*?8t2ujbV>sTsYHiSGuKX-cIu3QDPS6oVyA4EfZW2Xu4$^yXXbD|MOyt_HljBV9W z6`249m?4$_7Z3xlgJsFO8%4&}bYl3;ZyYtwQ0-PxX`kA^+oQ_p*x74by-6~1385-` za4&r=N%(~UHR7s(Dk}VPdPzeDZiiDz89;xt4p`a7Tg6>H)D3wmCj|!yibe7T{AVh; z*4=`{Lh%R{UP?R~u#_Hh;B9SUj(aupz6921>-B58q3%Q7{#bHcIb^a=%!{q|0`7%`CQcJU~7Riz({dUF&@K;~-%)}AK|MpP z6Vq)quNDoPAyEd~Zbr-yWc;Z)i+Ff@&0EFP-0rD^+#qCOLB+7J0{)#VaJAHF?AKT} z(v`Yr>SbyflDqkG5@ggM7A>wpIw7u#q*V7aSJ^-QJIP#+3%@TSRBw}~2Sq{JXiSHN zCvYnL$RPDV$sdq;5H!BCyKVExK{i3sTToWE`yQkVVmeuft0<@iSmwbkZ&W0`8Hq}1 z8pY?Q4kVmBAl-6C3703W%N+{L$2-ptYO!Xr_!s~_mYIKk#TD0f#l(r)50*1O zT~}6fshz-2@bN`%=&ax6Q3Rtco!>Xw+yDk&7V_`#v@)#s*R1XPkO;Kw|0ka~6a zdfJPaG8moV6TDf9k{=LetjpsNUZc}^*~h?omwZo}fmCQuOonx^b(n-}IZ3?t4W_#PZ236ID--qTq5GeclbvmU%r!C#T|19f7bM={LI z<$K@Ay!9H!DU!u7g?@d<%}CWobKJz-j;*zV=OZy49x4J6K894zlL`2^25M^|_z#AL zXRIxR;0&gwh`h+Me|Am;a4OM@*YSZ%LB0eoh2dUNAF~gb%BmMX2lz)ubQF>z&k;|v zXuXMHT#4$qC6F(|-5iTQ5?njvOXssIn6VZBhjT-nLXa_9J10)*#OMc(E~FW4_y!tr zpyow~JQ9{b<=G(42t7}_U*5Jis{Ng*(?eYKObubVVF;gk1;H1)`_hAs*i5FhyV1qL zn_mH!s86VWez=1m?V;$Vt0F!bK8UlrJ+X$$yoR+V$RpVdzGVrSVUrMb0r)I=BJkO% z_;ZL~1d55oZ&JGEJ7*n_=(lfD$}1Lk%(0H%06I0>{Em<8P@p2|9wmtwi94%en3joo zs5BV`Jf6IO|8BL{_3tX)rCp({-nhh}lkUihBo@j<`rW%CNRvD3+-zQN=HxCtvKuP| zNIYrR(!Tx^zCmRB+hK=BhiGvJBknGgf?KLqy8EO(XPvTw#;&~3B2aSu>7@gR1*ApI z0LrjP!rn1=%VhYywzo8Vfkez_K2wE(bANl+7!(j-Sw4~|2#VgPke%2TlsM#>2O zLM}42U(mDn^%}D32eRO)0Fs^#4_|RAO#u$wk7Qv?pvUbXdt{J;J3n6>YPP3zAc%2| zPvr-S$1_O%i!FnFDWk38P|nv@7)5NtM)P?EpeFjkip85!G?Z>Kt`3TKiU>k@Ntcr2 z#P?Bns)Ks){v6ddC*TseBo`@*_fg`m*AQz7*N~vkU=p*%bz-r|l&0E^;EHG2hogJ7 zCu*dN>lLXcfPHZSc%61JbC4yDBXEzmnAxoc&$#U`**7>xwezv8^?kb+LEiUk*vCQ< z7L||Hhfe6z;xo~-EvoBw=Vec1^%8ZRv&%|J+Be~9bP{&_y^J(7RzC_{lIY+z4=tj@ z<}I-`VGYH;h+>$^M(_cWr_3@9AZT<{dA$!Xh+&&#MKY6opZk-mKsA(SpLEx<$y^Cn z4gkx||C00p3n8eH*|2aioZK-IBa-L-fWcVn}SELDwx)Jllb2CHe3m@i&x>cGr9Ixs~!M zOG^|wxxkH`PTJTw$Vx6q7Ax79yy+6I=BgXb-)k6Y82cgezic&j=wqQLOON1tK{+=X zpWj+L2-Kss&cf)H4VjJEQG?~4_z1!Cfu8!z!_~*+8S%dTn}^P&d(*_}T)uaQKEDMB z0M~w`LHBpvNQK~#Louu+Jzk=+1pSQ(JmX9iy~{1i%Eh*0F-nab-tJ2*b{NC1GBZkm z<5WTuPy?R>lK%5c)Rw5S8C1f%69VqqvsTC+|9xOtHLX(Gm(+n1R|+kgDIR!cZe^SRw}7d z;1&em1-gDV6g*@e4JNquZCras|!I3mmu2_8wnNe^b(RX!YgJmR@kpN_+ke zN`AvRg&|j zlt6_`N3vKGh+P?G>H$^=Hk26yRz|@`CzS8?a?UqmvhMU)n#Q*q&hVAJM7=7`g@9pe z89^<=G(sm_Xlz7mRswoTyYz60oQcfIC5`WJn*c#XDC%LR1XncX@lk5zthKr8aWR6g z*hz(MArpKerN|aCl=H|}N;ULiw!VkJdB6UT&f3!vDrVG_N30uZJ*3FGavst7@RE(% zQ3-P_&_?8bq2tAqnG~n{@01>-qa3GMUVkVib@76t>i+aY#M?422j6bHc9ILyvS*B> zQQ;hTorEx+5%Ejntqj?MpK@L-A>*grn3}Xmf~eL9A<3fu@V^M${v%Mb`npo{-kWab zY$g4;waJ-CY5_)}&t6?C)$H8ON*&Z{gA*WkD2AnI$WqGr+dDx4Jha4IECI7ORlX%xLkM2S>PMcfQAoTHXiHgre$Ng``C+UO#Tf z%h)nwFM(vfd1`y)$+e<9#vF(0WB#2seWeOrC8+#Sznrt;aTFq+VHge(W zrLULV-9kwxSkZvb=A>{4q$?@Los{c>y!(<4Z}}x7H_1eA)Vm2%hAVvAq&Gr=X3qss z%ZI$*`HOR832P|h_`UCt@YeCB?vDk`1ijIFpj0~S;5t0+y?on^xUzWvD01NIzw-6X 
zg!GOMi0ue9#H92NEiey6Cu+B^icR#ZYNp@eiUFO?Nfr7Ruph>k>z8L==o+C44y|SzJlM0I*>xbKB8ipr}PC$Vq1>q1lcQUVmYSy6QkL>A*e-!H* zE^(h_rDTROBbAFN7eq_a_1wd0CwYNzI#a@`n-!AuwhhFxQXr+>8N&+;k^;lb@8IM0MP++-^ot&?qrdT% z@mt^g{?3Z;HrZm^T9}sx)ecIrLxK@CD-D*|m9|IDBSIvWPqVHyJ{kM@xVB3677f>}YM!uoen+4Oz@ixxU4lLhmdnA5_Cq zn!eQCP6VBdu#5-q++!n15F&4}luzs{UuR55zOLgFrsna*>NC!J?Cp@C$r2nxuAoQ6_@4>i!6BY@q3nq~DerN>eBtm6*u#Q`uY>m(|fJDWc zpd*|pqn5K+7*%^nTL*KYS_V1t6%vq`ecJ&{84B}oF zCzG?le%RKJAo5Za*j|fNy}S>y9=!0XA^r$uwZD_MT)i18>}k80A($6~-0{+6T>DhH z))3w`G*u{EYE@%Bnl`c);H`-I_l(mxT>~H9CT$R>H^+UeV*&En!Rqu z{b+UcK~w&8PUYTj?1*4Qo4e_xVehcV!aJ`ri#6`$VfW$Z)xp#{#z~hsQAf`=ZCNL{JQMT4Pss0(=nZcMfFg6F79R(b&tT1 zA~R(|O243sb%AyG9^}`bKkgKq*>=nPf)x~SUzz6ij(RZ7+V`Tx0@d|mcE1L^^tM(30<+-Ybq|(J5AS4>HfrK@Y`q@59{K__?e~yDbZ00uR4!EC zK}u!5t72Q@REmf9ef}1&kj+`|1rPau?7e4LQ)|;U96-c^pj)L#(@g*iO_1JnTOiUb zM0yhhrArH)pj%K8(4h39fFeP9?@?3?RX}=|07?r4>Fu2b?t98I&N*k2KxjXcM@vFOp?}r*ww!X|C7V!TO%^5~lQ~G}nOTVKudFPpn-)eHntlvC zhRDD;c#xzEuvuo`&^oIfC@wke2%j0G7T4i7*a4HxQ<>dQ#VXxfE13F=UMCC$4-_#f zdn_uc`g>F>m0KoGu1NA0I&Itws_#p;$fciT^|638R1B9^$4qjQ69FB)TIu#?acbG$ zV+HK&wf)mh|Qn@ST zdbX8j$({Mf^Htw#_bs0r=y#?534GJ{v(r8Fd?gD0^{O(I^1)YCp4F#V*Wyclo*(7q z?32PTlWLHoM9&01sPdma1n@GRDejWg4;LM?f&LktV<#SN;jU{?E<-83EJ=LP+7vqd zZ2V8L+7bO2*MQ(;!*54AP-0X*5uor!E|vg8l8Q2AV0i==2_sGL_PqzC8K}XbPY;RG zV#X>-r&fwW0G)UZ8Vnk)uOhoRzZ+_@xq#|`|1K(1Od)?OTn*#$hp!+g`)zyp@|e3r|Zlk33&C9GuZM_MX=FcNwF&q=kO9TS?Ru z3G7Xg-iG;mz}!bc-P%%}PF}8II_#cT{qp@^{Ve~%7E)dpu-O!Xx~2d)4?0jabcsy7 ze0}_Fot&h5(2(^3wiYrE;ls$cVZB$3q`T`qH*nwrTtHujtu7im;xwOxXs>=^TDM4C zcqRX*jrr!r(ZOb&9`n$b{_BenS1QIf+4s%PdOzk@@tZpdeY9AUwaw|GA#<`Lcgd}~ z2;ov&_Q#2vEB#zm(k14b+7RAZ)1)tk(3_s+{~c!bd{epkeOV_9NYYHw3t)h0#`-bu zn3SLJvB&U4Ua?=8eev*G@OHHz#K8AH!1m`kU_F@P4)>it8uU@%obOIQcT0a@&DV;c z5Gs3QV|H_nWx8nn?j@sxeSJ)mu>*+Ynrqzs~OpjXXTKhM;s@Y9$%)XsA zqxI3@16LhNXVb%RO{?{Ni`>f)m1-`@G|*RNMci9rJO>2oo(O>cWw~eI9lM#^4xYO< z0k6Hk8X?Gq*B)BTiqWUin~a?{xuyUna#}r{{aOWa!|}dLN-1Zue$8WHAn!}TR|}r1 zGEKfJIb)k{kg@J#Q@6Ib=3IT6x;(Pe4>uhb(loI6>Qut7o>e<#v-fMCT!L2S?HgCx zMzS1B*TyDVwfpAM`DHZhea}WPZf|*pirtB{*DDPEV^()^|5!pfgohVyM?zI5HDbscmbKdvYj!qu z_i!0KPW5H&_Nc)bPgecKk3nyW)|-{9N@kx97-c)so|dTNm%ZndL1ovUFKTAQtj$+U zIEwjIEC*F}NBL#-dXS2!6^o=akqr62+V}k@KpX)81|s@jp8q%>-K=zt2SkQy8A>hY z$%B2#pJV8eKpm8H9^D2FY`)qSOQ{(-qJ>0CJJQEI8!o;79i3=M=u;gokTlYv-)F#2 zW|Nj+u{5HD5{{Cz9zQV0>)df}ZY;xlX>FN{ZT<6_CezyI^0WtYV-4z~jq$jF2$5c< z>x=eX5^e)*Z=@Z&K8&V`T(|Z26;oNmM)zHDP#j$GmJy5T{xI}@)7EV;knf+^&;Yi# z+ruqFa3IO+)3QlLx7ra#NYv|2F)RPf0#I42efQ~cz(%@uD8fLFiR;6Zm7lB;8>eRk zV%(lTJJb3P1U`QBPWqoPTatUXE}~?h3^%v2_0O7(g{xL;1|)^?89ei-7bb!xVXNup7j%N5S{5e`8MExgidb4#r=`LFxihA`1FyYIG)= zd0IP`Zks%JIQTSQ1i-#*dJ;p*@3t|q1Znn#2gqEjAYg3@AZwK$H)T=eOZ?UrdYDv~ z7CLLd$HSi{M@y;?NWJdMGCLP!0HpphDH?$@DgbAumMUHyvc~=^-#*9~qvgjp4|wco z`@AP@9?Bc^Oc3eu54N^iHyuM?QQ*BxFJ5B+=bFFWriw~rZhRIv0-5b%joJ~I^f6)K-dV0tdNGDEj4AXn zprs6wE+oV@0+(M32clVNm#t2}&0EMqD>a&HbZzbj#hk+mPl1|MM=t&(HL-vN2+SNG zR3eSSBWo@VW{~)PZLSI2xk0%Y4Yb@L5AL>^j58;X6!i&-)uth_CjS_8@zby{>J_^^ z(IREEN&l^c#E(}`_CAv=%@BzSYYe85LB40~w_LAhj9iq9W*^Zf@dt=qSiF71_qom2 zF&I}^u`$7%2PN+rY$kCzbsD_*%xnJaHTmYhxr$_@3F&|YXV7JMUu@*B35f9ZnB z^K{TE%glwr&p&vlUT`n54=%zF?wUN8xxgf)3AlXp;UB^ezA z+womd#Zg+w7~~!__qd z3n8H#XTe|*L~GL|qSE4ItZ-Udc~4ky%I9q~aOCcKa-|8Y;uD)XGHnp1nDz{kNqnOh zJ%NfhIT{B`In)Q!>~;9|W_?Jej!)YVlds8pUfr;TnUee7=sS^6@Ck_3sxZitdQf71 zXF4)STmdQ&Gwy;a6@WxePPT%{4wtmRs2B>u)`U2zgCk3r!X!_3;}!>#CIEFe_(;uM z(wfEKRnlsJqHKCdCR0C(8{!aRZ$Cd1hQOsEE8d6Ctnz2}`NJtHg?A6JNiz(Dd$gW> zZ7uKFx8vZjV2~xG>VP~5gP53v0#X$!jFA_NQ3ZBn=+tT_cVMgA%7wQDhm!iRiMZ<= 
z(z2c!g;_O-qem*CDx^1oIRH6QFkY8I_4AF8cPEIN0-IWHhu+6@a@)7@9~dCy^+IyLE@gJ9q}{>22ppdutx26Elt^DH!=+Tx6~T z;+;XhiOv(#N_l3|GV_ih6R_Yvn)tNBp46ZCn!Fm29FR({O&9t;f^igB2wYBE&(&$KSXe+&$?Bpu^JLFey~tN_dp;;!0TJWu&Kqb7F8BQuVJ%du_vcv`5>_uRhz z13coEGBDlH>(#x8yJ^{Il6a1~lMN`M=6%SJAN%^=sg8NaO=xgb>aeG1y(v99_&L1K zkjW>&=F8Y)h-Et$Z?paqtM#K>0@j+->5my#;@-Sf)xwa6QaV*tiAzMm8NAYl-0S|DOKK*x80e$Ci5=*)M~ zL=G;01DG+}s;|&f(5o3`V7k68my}rkd zvs%PAPjzH)a4-A*Yrg1k<_?xUTAUv(=90>l<^Q^tw?*x4KAwJhsh14(j4m(%+rg$q z_$TW*?l0@rDq+2&HB290(6tpNl)V)_+P8B}dvinD3`SkJhFi|Sh@%{SHOQY=z2Ju5 z>4plmGPdGpt3lqG4DF>OmkELCDt5ld+h&^hR~J>U>@$;@wbjs?=dfFSmCzH=#xh?I zt=P|RLtoiHb_7)kLF|i3ZJlUj^kbQ1@B1_4ts%Y|f^xC~dbS*v@|)MlUjpf!miT~vmLK9<9g51Vg(9bMP$;5#eDTr|HhGqtgN z)~x?&t6-Y6VaCsaP0KQWlLPG>La_lFGzj4Zs$@J zVAJW<69D33f>fz`;@Rh5 z;AjXjX`&ly87(So4)&XcH&sdn~I zn`Qn*yn4eoPGR7+WMee0l_I8}j;0!(gQ%~{J$({`MJp;Mq;*q;>7jsXg=OhY`&Iu) zZ)Tecq{ZHFWI=zQ547>FCHw9MXxEj^g3JfDtZyf7n1x+o$56)y(3qG5>mu@oL9WwJ zGMb-?gs^aYWprSBo*!BH2hY)vw=&vKLfBz%(;Xir;%)|0PVv49_L(ZPAELHnGLKYQe~HLUFj zEdHmIp4jBTx3>X8r zg(I9D47SLiNH(XQ{Ap?4zGHWoe(x1zxbwH=9eO8sWWW!=_ZfLGq*OfXMVuP{;5^v6 z3^aYZPj5RRBJmNC8f2K2d5~@JF+7)ljDe&BttB&4x_)Tp-x>R_h4Yaq+hxzImt@qA zoQ9=r1JsihQUXg4dff3hj7W*ub|GYIDF*;g1&~~}hBcRNgKa#U?}r@r%0w0#VY=|W z@isu*y104vNX&fiCbZe0O^6P8mmUD)7Ab+A6CXR55M1D%g<)O!&w*$m{acS)a3F8+ zdy(riALZP|`#+q1dW3R{#03pl&9E&B1X&v$639*m6*Z92e^6uyyNxn9@B;ZEN_ov!}v-_PJ7I*^Z15XQ`1S7gkq7H8(|8hM%3973TwCO8IQE zdy8Ff+9{S^Yh98>{8hTjph{PR5@TfHyfnDWVBeag)6mCGj{`jXCj$TV zE2#@GuSu5(^XQ}zC`m}=|y3*#WY3ENjI$qXtZdo2R zYYDpxeaz)NOG{udqdzz>IsauQ$AGOA8!QHUSA+?XQ{cyU_OG9NUiIq8yRV`4sIAD+ zrBR-PwCSPIfD(>*R3vxv2Fj66G?s%a@p0%g9oGMH1xT6P`+T76MaeJ#2A3#3Pgqcc zu8=e!-FrVGVoEqYMnQ@vd>tSlJ4~-NGA_jaEE+hgi>kw?gL5StqT66z?w_-iIt$}j z3|nI!y3qK253#bF3uw{XgOa3k?@zehRhmhG2r-|xthLpkp1@bp=XUv+O6BidLeJ#D zueg#e&~1_buGogWbO5D7H+m6}n>>Qt*ucs1i&m(ssX1hncb=WpAdmsZJObaeh1O=7@SZG5E(n@^7us7L| z{vAq-cQm`H5QGbW0tmc!sH5)$UOhk?r3-<@k!AYXH}QYj~pM?rs z3S>Le3U_bloL6aj=)jqDSSQw3`a%#G^L>2Bb$dgb@1GMJNFzG2KFERK4kn~1{Z|r` zf+4dyeAopX^I$6&j2IY<)ph71JQ$D45@;65n=~LcC=XRD_dw_g#=GDp0~p`SmAM>* zPxzJ^j453=xoji)Z`LB_2x~@sjhj+NA$zq(_F4#gW%Ur4u`;U~jO)tg_P89+qyzH) zHRwjN4G@;KPM0FR&|y&4g}A&bb+zT)_N4dk^If%uWPy%Z>HTEyj&#b|j z|0!>I1=^8xPJhm54C)NxIsMpe1146BsK2NkvV;65w&%fr%W&pCh}#T>K`O)EdoDX? z#Yl1xRIWE4^F}<2F?4ieC{cj4M%VxGB(H3z%Q}c*EE#tK<6Zx6Mgow-xSssRWM}yi z*)xS!k`E_<+M=1zW1hMOXWIq581fZu`L1$#r37sCRKkZPBbjoM;sCrSQW-i;om@EZyVsMW-$Q8#Ew+#f&@B8E8l?xu9pn%{HeTPGpDLWLx)17g6bT z>15<6{FvlV|E(SZI_=!(rL9W(_ijOD%Kusx^23y!|3e*+?Y6r@|0KWvz2LX%{{L0f z`sQSx_t#t2#$roiX_=f2^DrP{Rl-4R{4Z}Hw-=x@R)v7%5Y@nd^b;J>Xu|M!1w=*0v(!H3+y zI#T1!ANjp1a95}9M>a>IDGnxhPydXkgKwmDunj+}r_USOcKl*Net;4VfyIN zQ0NwWU9_pp@NM=hqKy`xJ72d!*&Ez0s?akhifVRIoWn!cd-djP^>R>X(36)5aH%mOqhw(MBhPK`jG^QQ+!&k5`%mp1A#3PG%o#La` zT~_|S@7EtbH1{naF@t5T11C^YQP3W-R0Gk&Ed0b8X=gf7gp=)Rq4nmy+6DB$(;@ZP z@Ld0^BPqG@ukH!;eMMD>sj_iLM3i@*RK;|3B6Gi!7R6TpNy%RCTl)cfmN$Q?hgx!hz<=i2EG;qe?3K$Sc<`|qk5G@d4X#0qE@z!%EV3kK9OpRX-=2m^I7{e#xc)pjOsx~S=Mcyn}@*MCk>ml@xN(Er^qncKg6{gB~ zqt+8&YmsM6TnC2RH5*-Y1IK!&m=rhE39;cuk(8D8?!MS)PzOF_4(~l(l8W0I759mL zNiKL|l6mh*gvlgjx8?wZv7zp&U;t0;eK%j2uQ!*nXtQ`}FMefy^5UVe?IY^pi?5+)36 zixz_!C(?fwquvmOou7h zH`?MkFW>7BT#jWPaZq@Wqk}ErulD{rM^HppyUf1tq8NK#qP#(ys?l^Y%88Uy3HKo! 
z_<|*#W5ZoEAyCTL!q~0CLy4R@5koAQS%S;ge4a6c-@sarRc6^cu$_ z92+*`IkbO9+gcg~b`@v!u6MO}^Z4@ToFtByGYiZGSsjl`Ol%Wv;|cELbpNsHLr`gO zy8mD?6=YOTnW>({wnd=Xg_ir#VNFg&?cD4iSgmn^WyhNvXtb=tmqP9a$)78OJmb$ zJ`8OGt;whr?i@g=AyE0RvIM=uPyWZ4gS|!@iF2lG(p`JUsj%)NyR9$#(CwZ(E-K4LW z`Jvgh&PkPk6Y%WuDl0g#CN3T9+*HE(ID@s}%Cwx=oE*#TE_b129RjOmXWwjN`$>%0A*Mn5f(|oKbf6x2Trhc=_8wi;Z?LK}CTvIebh<|O*70RU`PvbS9GU7I zQ!05T!Buxm5rhhVGgE!pfZK7dmwE5wG565(Ed(D$@@lw`Wn1OfG$NQlIxbXT<-vSt zli9HF^q#7dU=ht2qV3D&5`J=6LHkBF?r9om%=+~htyB0is!5whL|e~mO72Zmqt&)| zOYKpyd-F-lZM9YjBqysrbMwknr;Vbkx(s^y6qsvLiH+K{6tv1yb9SB2_SS>}ZQ7gm z?t+SH5KaTm@ziUgJK9s9F!m`j!=m6+HRm){nx6+D7XTj1Dn8&7%pme6P{zWInicle+;Svn3wjy` zCSP1|Z*vz@A|IIm%hwXeGp=y8KkX94e{dB`vHX-T<+pi-nakyuA4KOK6& zeEE={^jU0tym_FnW8kn(k=wot+F*aif$_YCRB>kF_%n=29EB-74)hy45sG#s&v5s4 zDF6r~>4C#yf?c}&mC4wOyzC(d@|A?;7wz3(xS@Z4U{xK)i=`zVQRUWr@S({(s_vA% zvr9=3QJ(+&xmp*z2?%oziw(Tlv?x#_5z6p=!{IVwHq=EH3Ekkn!h~n$i(lpxYRs3X z)o3ed$AV!f(n#y@$P?HCC9>Ep+1S1V$I|U3g4y<{a5vH!Hde`!OX7!}ejV~_Hxq5D zwH&48V%x&3^&aLP+@<6h%KPfmYHm6~@pQF=azYvRBM9eewo{}1=SGu=g@Vh?Xq9-% zlg7J0sO`)$9OdZ+g{e3W$l*CoKD)xRi|1(8gmU@Hr;LIhBIe$mbm1fr<%P&uLY|gX zVbTg4W1EnW=VgNWaM}LQ{==A!N}@@dfwwJHJl!rQxha@XE=XQzyvq^R8eXnMK1Cca z$S7D`VA%mx=gvyX&0oavTC`*6RGYc*oCC@)XaYSF7T_Ix< z!YQ<+#`_I-FA}QKiDUWg%FPP(C*KlzF#Qpf*@`@IPIkfN3F1lt5{&x#6ri#^%R233 z@tio0!Bp}FKPG*_w%t!3?+9PBAZ#kvD!)65sH{|+JaDm#=~60DG?R!ET<$cWYvHGre*mN8t3iWcYDgN}y;IcOZ&nTP~d8l`= zfEukEt$Q&b;_xle>X$@6akAsb1191}f;s1V(PE(%IzNa1MmrLTL1<5mt-Ut&>_PXN zRFpYM)udBMHA&Swv6UhjvD#6 zPFoXt*@(ac@^aVSy!(>@vmr=6=rtD5V2K>5A#r?>u*M3%#bAv~j~j3L2J#yfx`pmw zDwQB7-gx1B{1<$4v;}E$Y(koV@C~@HHw#OiT4Kwd}O|TO2xV7|h0>`y@r<%Ir&JV(<3pGYX|kArT$&4a zr|XtNpEH*|RLsS(9Tj_+#M=5R1x@|IL;od$nX%mZ7YpEwZ2cR4%@MO{MD|<7>T}Qg z^-(POoo!>0gf%Q-4pewcJktCMwnsvdUGzshBcAMSj$Ox9oJVyLD7eXAPzSy{`G7Bi zNZhy{sL;n9!tkW_yKJ(UfTw4@s?`Uhz;oF6D2fF}bR6z|G5eXAtA6+w?k~ck&I=xH zI#7(ox$xI%#3Jw}6XzmomjU%KUX8 z4Wn#;HQY)soyXIQZ`VE4Y1@tIPu#L}Qg3vflPPwdhHj$9GvHq=q!7K_sr%0oC~?P! zAR(Q)Fcj3Vkbu^n;-N>WfqWIj@!c4c0R<*h-VB?n_xsz5WyC|3$fjO`RB6NkHCjDv z3gyDdqJtf?>YS?S~M`koG0<8KN(o}#;ssCpxkkdj2WGKnS$lp60b6srD~?B7K~az#*`TaAGy z{^X%A?ifz5WqAv3mvUkaD)@17j;0jq$-NThMJ%={v@a9&W9&Q8%ZIw3RMD>kDhUF+ zm%RZvB8uqcL0J4aa0>OK(I{}0NaW#kU!=n)_+{cygC?nPu3OwzO)dR?E{Ntr%T9rk z*#T`kOp`Q^X7=*8(1Oqas|2v4isM^89B5; z92X)Ob%O$h%ikf(_YlfUw=aJ+Q+>(9SokHRJ~&4_5#F$Wjvn>-JB=uxirY14BKD3L zzAdx?(lx6)q^!5v|M4SVVVPfUDrQC{$?R2iBStB7oW5Z~kR zJHCc=(G7#utyc(=+u@mm;wQ;3;Ht%L0wwzQ)xm|_FW^+;PP1;QVjbHSjWOv2{c^ve zQh89tVW~g}!p>3vJKe+<2!t}z;je;!Yi2cMThOC4-h%QR80Yk>jJfK${ zwk9YRgAR{A0_(YsRUjx9;aE_zds5LQZ;_)g{Qg!62#G&IYos$eZ42nEpYaSFjJK}- zs5c6fs`yiR*sc)B5dx*1fp=4YJpHL);$>nHx+{kA@zd}EdQ+7A6Vs^S>_BLZSP?>T zB&vt|29TSo$O&9mt;EhEK*Lj!#a&iYi)2uk%s z&?T011mS+@m$SO58(5aIXj~+)oQlTX+Kg{uG7(vrUO)0$A~JPE z==)~taE=g}o!}!(PP9C}IBC3Dm3!L27P9oO?<;MT{^2fBb=o>cG|a!yN!VQos=g46 zgQ?MGsYkg&b#J#!mBB(4t3tM2wo)~7Pi1kVzG7~U=uaN}aG@l5K6oO6nab&?Z-hga zUF0RwK~X*0TMJ_4P}6&4S$~SnGmzhjAVNNM=gP`<>Ay zu|svr(v3hOs5{#es_L8;f9V?97CDYOmKJ+^sRov%R{73)_yvkN2P}6-Q%0Hh{skP$ ztZ6lX#M> z?ll(mGJPHka#XZvHQ0FqTKh7sn)CA6WWT;ZO=nv|li+gQr{NAZYDh#+dQ3jI(tFMfX{c z8%0i}K--LpQe2=3n?hl4!u>sLnB2~D-{U^be?_z3((~Xv=(5}yU6(%fse(b{V`_ML zX2HU2|GCh%?)I%?CBD_n%tZKvoeBsDNtd&i7E>3iIxv}Vajna0~=&e-@^ z3e|gzqhx|mo?Ec+tpA+!d8~ay_G(w1#ALU$YO-%AZ+c0Eio)N(=0VEZL`-w{P?udI2x`@E6;qs+i|(`_rs zPUNi1O?ck%p>s-^uFszMz+h-Q&|$E;j$OI;w!q!kc8cB5mQ|A$g7K56=5bV4+1O#$ z_4&$zK9E##C5mDyEefQEeF^5~R72f@$#CYfjr?{$5pogQQ;Yjib-u4a_1prNfvvV< zhOZ^@RwDXNhHp~Af)Z!w=jdLo6YB0JRFtHPabR*NaR1&{Y^PEUZFRF+yg%PBXs_%w z@IFTfvh38f$)xgxeQW=8wG5&tl4OsGgi1xv^*vV|FGOdl(Lyo`Hlfs4rJjw=fX0~Z z6a|+zcMG%qGO+bFw58f- zD51%$^NkFuKl3=

[Embedded report fragment (binary patch content, not representable as text): a software versions table listing CUSTOM_DUMPSOFTWAREVERSIONS (python 3.12.0, yaml 6.0.1), TOOL1 (tool1 0.11.9), TOOL2 (tool2 1.9), Workflow (Nextflow), plus repeated FastQC "File type: Conventional base calls" rows.]

From 1e7c398314d1225730ea997eeb7a543827857798 Mon Sep 17 00:00:00 2001
From: DSchreyer
Date: Mon, 8 Jan 2024 13:12:44 +0000
Subject: [PATCH 28/48] updateto 1.19

---
 modules/nf-core/multiqc/environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml
index bc0bdb5b..7625b752 100644
--- a/modules/nf-core/multiqc/environment.yml
+++ b/modules/nf-core/multiqc/environment.yml
@@ -4,4 +4,4 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - bioconda::multiqc=1.18
+  - bioconda::multiqc=1.19

From 327a531a78dc1c0b19471523e44d221a5c23c640 Mon Sep 17 00:00:00 2001
From: DSchreyer
Date: Mon, 8 Jan 2024 13:15:42 +0000
Subject: [PATCH 29/48] removed redundant scripts

---
 .../ampliconarchitect/ampliconarchitect.nf   |  84 ---
 .../ampliconclassifier/ampliconclassifier.nf |  68 --
 modules/local/ampliconclassifier/circdna.nf  | 607 ------------------
 modules/local/ampliconsuite/prepareaa.nf     |  82 ---
 4 files changed, 841 deletions(-)
 delete mode 100644 modules/local/ampliconarchitect/ampliconarchitect.nf
 delete mode 100644 modules/local/ampliconclassifier/ampliconclassifier.nf
 delete mode 100644 modules/local/ampliconclassifier/circdna.nf
 delete mode 100644 modules/local/ampliconsuite/prepareaa.nf

diff --git a/modules/local/ampliconarchitect/ampliconarchitect.nf b/modules/local/ampliconarchitect/ampliconarchitect.nf
deleted file mode 100644
index 3a124d63..00000000
--- a/modules/local/ampliconarchitect/ampliconarchitect.nf
+++ /dev/null
@@ -1,84 +0,0 @@
-process AMPLICONARCHITECT_AMPLICONARCHITECT {
-    tag "$meta.id"
-    label 'process_low'
-
-    conda "bioconda::ampliconsuite=0.1555.2 mosek::mosek=10.1b1"
-    container 'quay.io/nf-core/prepareaa:1.0.0'
-
-    input:
-    tuple val(meta), path(bam), path(bai), path(bed)
-
-    output:
-    tuple val(meta), path("*cycles.txt")           , optional: true, emit: cycles
-    tuple val(meta), path("*graph.txt")            , optional: true, emit: graph
-    tuple val(meta), path("*cnseg.txt")            , optional: true, emit: cnseg
-    tuple val(meta), path("*.out")                 , optional: true, emit: out
-    tuple val(meta), path("*.{pdf,png}")           , optional: true, emit: svview
-    tuple val(meta), path("*_summary.txt")         , optional: true, emit: summary
-    tuple val(meta), path("*{log.txt,flag.txt}")   , emit: log
-    tuple val(meta), path("*sample_metadata.json") , emit: s_json
-    tuple val(meta), path("*run_metadata.json")    , emit: r_json
-    path "versions.yml"                            , emit: versions
-
-    script:
-    def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "${meta.id}"
-    """
-    export AA_DATA_REPO=${params.aa_data_repo}
-    export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir}
-
-    if ! command -v AmpliconArchitect.py &> /dev/null; then
-        export AA_SRC=\$(dirname \$(python -c "import ampliconarchitectlib; print(ampliconarchitectlib.__file__)"))
-    else
-        export AA_SRC=\$(dirname \$(readlink -f \$(which AmpliconArchitect.py)))
-    fi
-
-    if ! command -v amplicon_classifier.py &> /dev/null; then
-        export AC_SRC=\$(dirname \$(python -c "import ampliconclassifierlib; print(ampliconclassifierlib.__file__)"))
-    else
-        export AC_SRC=\$(dirname \$(readlink -f \$(which amplicon_classifier.py)))
-    fi
-
-    REF=${params.reference_build}
-
-    AmpliconSuite-pipeline.py \\
-        -t $task.cpus \\
-        --bam $bam \\
-        --bed $bed \\
-        --ref \$REF \\
-        -s "${prefix}" \\
-        --run_AA
-        $args
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        AmpliconSuite-pipeline.py: \$(echo \$(AmpliconSuite-pipeline.py --version) | sed 's/^.*PrepareAA version //')
-    END_VERSIONS
-    """
-
-    stub:
-    def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "${meta.id}"
-    """
-    export AA_DATA_REPO=${params.aa_data_repo}
-    export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir}
-    export AA_SRC=${projectDir}/bin
-    REF=${params.reference_build}
-
-    touch "${prefix}.logs.txt"
-    touch "${prefix}.cycles.txt"
-    touch "${prefix}.graph.txt"
-    touch "${prefix}.out"
-    touch "${prefix}_cnseg.txt"
-    touch "${prefix}.pdf"
-    touch "${prefix}.png"
-    touch "${prefix}_summary.txt"
-
-    AmpliconArchitect.py --help
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        AmpliconSuite-pipeline.py: \$(echo \$(AmpliconSuite-pipeline.py --version) | sed 's/^.*PrepareAA version //')
-    END_VERSIONS
-    """
-}
diff --git a/modules/local/ampliconclassifier/ampliconclassifier.nf b/modules/local/ampliconclassifier/ampliconclassifier.nf
deleted file mode 100644
index 91749eae..00000000
--- a/modules/local/ampliconclassifier/ampliconclassifier.nf
+++ /dev/null
@@ -1,68 +0,0 @@
-process AMPLICONCLASSIFIER_AMPLICONCLASSIFIER {
-    tag "AA Amplicons"
-    label 'process_low'
-
-    conda "bioconda::ampliconsuite=0.1555.2 mosek::mosek=10.1b1"
-    container 'quay.io/nf-core/prepareaa:1.0.0'
-
-    input:
-    path (graphs)
-    path (cycles)
-    path (cnseg)
-
-    output:
-    path ("*"           ) , emit: all , optional: true
-    path ("versions.yml") , emit: versions
-
-    script:
-    def args = task.ext.args ?: ''
-    def prefix = task.ext.prefix ?: "ampliconarchitect"
-
-    """
-    export AA_DATA_REPO=${params.aa_data_repo}
-    if ! command -v AmpliconArchitect.py &> /dev/null; then
-        export AA_SRC=\$(dirname \$(python -c "import ampliconarchitectlib; print(ampliconarchitectlib.__file__)"))
-    else
-        export AA_SRC=\$(dirname \$(readlink -f \$(which AmpliconArchitect.py)))
-    fi
-
-    if ! command -v amplicon_classifier.py &> /dev/null; then
-        export AC_SRC=\$(dirname \$(python -c "import ampliconclassifierlib; print(ampliconclassifierlib.__file__)"))
-    else
-        export AC_SRC=\$(dirname \$(readlink -f \$(which amplicon_classifier.py)))
-    fi
-
-    AmpliconSuite-pipeline.py \\
-        -s $prefix \\
-        --completed_AA_runs ./ \\
-        -t $task.cpus \\
-        --ref $params.reference_build
-
-    mv ampliconarchitect_classification/* ./
-    rmdir ampliconarchitect_classification
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        AmpliconSuite-pipeline.py: \$(AmpliconSuite-pipeline.py --version | sed 's/AmpliconSuite-pipeline version //')
-    END_VERSIONS
-    """
-
-    stub:
-    def args = task.ext.args ?: ''
-    """
-    export AA_DATA_REPO=${params.aa_data_repo}
-    export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir}
-    export AA_SRC=${projectDir}/bin
-    REF=${params.reference_build}
-
-    touch "ampliconclassifier_amplicon_classification_profiles.tsv"
-    touch "ampliconclassifier_classifier_stdout.log"
-
-    amplicon_classifier.py --help
-
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        AmpliconSuite-pipeline.py: \$(AmpliconSuite-pipeline.py --version | sed 's/AmpliconSuite-pipeline version //')
-    END_VERSIONS
-    """
-}
diff --git a/modules/local/ampliconclassifier/circdna.nf b/modules/local/ampliconclassifier/circdna.nf
deleted file mode 100644
index 34e03ff8..00000000
--- a/modules/local/ampliconclassifier/circdna.nf
+++ /dev/null
@@ -1,607 +0,0 @@
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    VALIDATE INPUTS
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-
-def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params)
-
-// Validate input parameters
-WorkflowCircdna.initialise(params, log)
-
-// Check input path parameters to see if they exist
-def checkPathParamList = [ params.input, params.multiqc_config, params.fasta ]
-for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
-
-// Check mandatory parameters
-if (params.input) { ch_input = Channel.fromPath(params.input) } else { exit 1, 'Input samplesheet not specified!' }
-if (params.fasta) { ch_fasta = Channel.fromPath(params.fasta) } else { exit 1, 'Fasta reference genome not specified!' }
-
-if (!(params.input_format == "FASTQ" | params.input_format == "BAM")) {
-    exit 1, 'Please specifiy --input_format "FASTQ" or "BAM" in capital letters, depending on the input file format.'
-}
-
-// Modify fasta channel to include meta data
-ch_fasta_meta = ch_fasta.map{ it -> [[id:it[0].baseName], it] }.collect()
-
-branch = params.circle_identifier.split(",")
-run_circexplorer2 = ("circexplorer2" in branch)
-run_circle_map_realign = ("circle_map_realign" in branch)
-run_circle_map_repeats = ("circle_map_repeats" in branch)
-run_circle_finder = ("circle_finder" in branch)
-run_ampliconarchitect = ("ampliconarchitect" in branch)
-run_unicycler = ("unicycler" in branch)
-
-if (!(run_unicycler | run_circle_map_realign | run_circle_map_repeats | run_circle_finder | run_ampliconarchitect | run_circexplorer2)) {
-    exit 1, 'circle_identifier param not valid. Please check!'
-}
-
-if (run_unicycler && !params.input_format == "FASTQ") {
-    exit 1, 'Unicycler needs FastQ input. Please specify input_format == "FASTQ", if possible, or don`t run unicycler.'
-}
-
-if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' }
-
-// Check if BWA Index is given
-if (params.bwa_index) {
-    ch_bwa_index = Channel.fromPath(params.bwa_index).collect()
-    bwa_index_exists = true
-    } else {
-    ch_bwa_index = Channel.empty()
-    bwa_index_exists = false
-    }
-
-// AMPLICON ARCHITECT INPUT
-if (run_ampliconarchitect) {
-    mosek_license_dir = file(params.mosek_license_dir)
-    if (!mosek_license_dir.exists()) {
-        exit 1, "Mosek License Directory is missing! Please specifiy directory containing mosek license using --mosek_license_dir and rename license to 'mosek.lic'."
-    }
-    if (!params.aa_data_repo) { exit 1, "AmpliconArchitect Data Repository Missing! Please see https://github.com/jluebeck/AmpliconArchitect for more information and specify its absolute path using --aa_data_repo." }
-    if (params.reference_build != "hg19" & params.reference_build != "GRCh38" & params.reference_build != "GRCh37" & params.reference_build != "mm10"){
-        exit 1, "Reference Build not given! Please specify --reference_build 'mm10', 'hg19', 'GRCh38', or 'GRCh37'."
-    }
-
-    if (!params.cnvkit_cnn) {
-        ch_cnvkit_reference = file(params.aa_data_repo + "/" + params.reference_build + "/" + params.reference_build + "_cnvkit_filtered_ref.cnn", checkIfExists: true)
-    } else {
-        ch_cnvkit_reference = file(params.cnvkit_cnn)
-    }
-}
-
-
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    CONFIG FILES
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true)
-ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty()
-ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty()
-ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true)
-
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    IMPORT LOCAL MODULES/SUBWORKFLOWS
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-//
-// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules
-//
-include { INPUT_CHECK } from '../subworkflows/local/input_check'
-
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    IMPORT NF-CORE MODULES/SUBWORKFLOWS & LOCAL MODULES/SUBWORKFLOWS
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
-
-
-// CONCATENATE FASTQ
-include { CAT_FASTQ } from '../modules/nf-core/cat/fastq/main'
-
-// QUALITY CONTROL
-include { FASTQC } from '../modules/nf-core/fastqc/main'
-
-// TRIMMING
-include { TRIMGALORE } from '../modules/nf-core/trimgalore/main'
-
-// Genome Preparation
-include { BWA_INDEX } from '../modules/nf-core/bwa/index/main'
-
-// Alignment
-include { BWA_MEM } from '../modules/local/bwa/mem/main'
-include { SAMTOOLS_SORT as SAMTOOLS_SORT_BAM } from '../modules/nf-core/samtools/sort/main'
-include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_BAM } from '../modules/nf-core/samtools/index/main'
-
-// PICARD
-include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main'
-include { BAM_MARKDUPLICATES_PICARD } from '../subworkflows/nf-core/bam_markduplicates_picard/main'
-include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_FILTER } from '../modules/nf-core/samtools/view/main'
-include { SAMTOOLS_SORT as SAMTOOLS_SORT_FILTERED } from '../modules/nf-core/samtools/sort/main'
-include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_FILTERED } from '../modules/nf-core/samtools/index/main'
-
-// BAM STATS
-include { BAM_STATS_SAMTOOLS } from '../subworkflows/nf-core/bam_stats_samtools/main'
-
-// CIRCLE-MAP
-include { CIRCLEMAP_READEXTRACTOR } from '../modules/local/circlemap/readextractor.nf'
-include { SAMTOOLS_SORT as SAMTOOLS_SORT_RE } from '../modules/nf-core/samtools/sort/main'
-include { SAMTOOLS_INDEX as SAMTOOLS_INDEX_RE } from '../modules/nf-core/samtools/index/main'
-include { SAMTOOLS_SORT as SAMTOOLS_SORT_QNAME_CM } from '../modules/nf-core/samtools/sort/main'
-include { CIRCLEMAP_REALIGN } from '../modules/local/circlemap/realign.nf'
-include { CIRCLEMAP_REPEATS } from '../modules/local/circlemap/repeats.nf'
-
-// CIRCLE_FINDER
-include { SAMTOOLS_SORT as SAMTOOLS_SORT_QNAME_CF } from '../modules/nf-core/samtools/sort/main'
-include { SAMBLASTER } from '../modules/local/samblaster.nf'
-include { BEDTOOLS_SORTEDBAM2BED } from '../modules/local/bedtools/sortedbam2bed.nf'
-include { BEDTOOLS_SPLITBAM2BED } from '../modules/local/bedtools/splitbam2bed.nf'
-include { CIRCLEFINDER } from '../modules/local/circlefinder.nf'
-
-// CIRCexplorer2
-include { CIRCEXPLORER2_PARSE } from '../modules/local/circexplorer2/parse.nf'
-
-// AmpliconArchitect
-include { CNVKIT_BATCH } from '../modules/local/cnvkit/batch/main.nf'
-include { CNVKIT_SEGMENT } from '../modules/local/cnvkit/segment.nf'
-include { PREPAREAA } from '../modules/local/ampliconsuite/prepareaa.nf'
-include { COLLECT_SEEDS } from '../modules/local/collect_seeds.nf'
-include { AMPLIFIED_INTERVALS } from '../modules/local/amplified_intervals.nf'
-include { AMPLICONARCHITECT_AMPLICONARCHITECT } from '../modules/local/ampliconarchitect/ampliconarchitect.nf'
-include { AMPLICONCLASSIFIER_AMPLICONCLASSIFIER } from '../modules/local/ampliconclassifier/ampliconclassifier.nf'
-
-// Unicycler
-include { UNICYCLER } from '../modules/local/unicycler/main.nf'
-include { SEQTK_SEQ } from '../modules/local/seqtk/seq.nf'
-include { GETCIRCULARREADS } from '../modules/local/getcircularreads.nf'
-include { MINIMAP2_ALIGN } from '../modules/nf-core/minimap2/align/main.nf'
-
-
-// MULTIQC
-include { MULTIQC } from '../modules/local/multiqc.nf'
-
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    RUN MAIN WORKFLOW
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-// Info required for completion email and summary
-def multiqc_report = []
-
-workflow CIRCDNA {
-    ch_versions = Channel.empty()
-
-    // Define Empty Channels for MultiQC
-    ch_samtools_stats = Channel.empty()
-    ch_samtools_flagstat = Channel.empty()
-    ch_samtools_idxstats = Channel.empty()
-    ch_markduplicates_stats = Channel.empty()
-    ch_markduplicates_flagstat = Channel.empty()
-    ch_markduplicates_idxstats = Channel.empty()
-    ch_markduplicates_multiqc = Channel.empty()
-
-    // Check file format
-    if (params.input_format == "FASTQ") {
-        //
-        // SUBWORKFLOW: Read in samplesheet, validate and stage input files
-        //
-        INPUT_CHECK (
-            ch_input
-        )
-        .reads
-        .map {
-            meta, fastq ->
-                meta.id = meta.id.split('_')[0..-2].join('_')
-                [ meta, fastq ] }
-        .groupTuple(by: [0])
-        .branch {
-            meta, fastq ->
-                single : fastq.size() == 1
-                    return [ meta, fastq.flatten() ]
-                multiple: fastq.size() > 1
-                    return [ meta, fastq.flatten() ]
-        }
-        .set { ch_fastq }
-        ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)
-
-        //
-        // MODULE: Concatenate FASTQs from the same samples
-        //
-        CAT_FASTQ (
-            ch_fastq.multiple
-        )
-        .reads
-        .mix(ch_fastq.single)
-        .set { ch_cat_fastq }
-
-        ch_versions = ch_versions.mix(CAT_FASTQ.out.versions)
-
-
-        //
-        // MODULE: Run FastQC
-        //
-        ch_fastqc_multiqc = Channel.empty()
-        if ( ! params.skip_qc ) {
-            FASTQC (
-                ch_cat_fastq
-            )
-            ch_versions = ch_versions.mix(FASTQC.out.versions)
-            ch_fastqc_multiqc = FASTQC.out.zip
-        }
-
-        //
-        // MODULE: Run trimgalore
-        //
-        if ( ! params.skip_trimming ) {
-            TRIMGALORE (
-                ch_cat_fastq
-            )
-            ch_trimmed_reads = TRIMGALORE.out.reads
-            ch_trimgalore_multiqc = TRIMGALORE.out.zip
-            ch_trimgalore_multiqc_log = TRIMGALORE.out.log
-            ch_versions = ch_versions.mix(TRIMGALORE.out.versions)
-        } else {
-            ch_trimmed_reads = INPUT_CHECK.out.reads
-            ch_trimgalore_multiqc = Channel.empty()
-            ch_trimgalore_multiqc_log = Channel.empty()
-        }
-
-        //
-        // MODULE: Run bwa index
-        //
-        if (!bwa_index_exists & (run_ampliconarchitect | run_circexplorer2 |
-                run_circle_finder | run_circle_map_realign |
                run_circle_map_repeats)) {
-            BWA_INDEX (
-                ch_fasta_meta
-            )
-            ch_bwa_index = BWA_INDEX.out.index.map{ meta, index -> ["bwa_index", index] }.collect()
-            ch_versions = ch_versions.mix(BWA_INDEX.out.versions)
-        }
-
-
-        //
-        // MODULE: BWA MEM ALIGNMENT
-        //
-        if (run_ampliconarchitect | run_circexplorer2 | run_circle_finder |
-                run_circle_map_realign | run_circle_map_repeats) {
-            BWA_MEM (
-                ch_trimmed_reads,
-                ch_bwa_index,
-                Channel.value(true)
-            )
-            ch_bam_sorted = BWA_MEM.out.bam
-            ch_full_bam_sorted = BWA_MEM.out.bam
-            ch_bwa_sorted = BWA_MEM.out.bam
-            ch_versions = ch_versions.mix(BWA_MEM.out.versions)
-
-            // SAMTOOLS INDEX SORTED BAM
-            SAMTOOLS_INDEX_BAM (
-                ch_bam_sorted
-            )
-            ch_versions = ch_versions.mix(SAMTOOLS_INDEX_BAM.out.versions)
-        }
-    } else if (params.input_format == "BAM") {
-        // Use BAM Files as input
-        INPUT_CHECK (
-            ch_input
-        )
-        if (!params.bam_sorted){
-            SAMTOOLS_SORT_BAM (
-                INPUT_CHECK.out.reads
-            )
-            ch_versions = ch_versions.mix(SAMTOOLS_SORT_BAM.out.versions)
-            ch_bam_sorted = SAMTOOLS_SORT_BAM.out.bam
-        } else {
-            ch_bam_sorted = INPUT_CHECK.out.reads
-            ch_full_bam_sorted = INPUT_CHECK.out.reads
-            ch_bwa_sorted = INPUT_CHECK.out.reads
-        }
-        // SAMTOOLS INDEX SORTED BAM
-        SAMTOOLS_INDEX_BAM (
-            ch_bam_sorted
-        )
-        ch_versions = ch_versions.mix(SAMTOOLS_INDEX_BAM.out.versions)
-        ch_fastqc_multiqc = Channel.empty()
-        ch_trimgalore_multiqc = Channel.empty()
-        ch_trimgalore_multiqc_log = Channel.empty()
-    }
-
-
-
-
-    if (run_ampliconarchitect | run_circexplorer2 | run_circle_finder |
-            run_circle_map_realign | run_circle_map_repeats) {
-
-        // Define Index channel and additional bam sorted channels for Circle_finder - not usable with duplicates removed
-        ch_bam_sorted_bai = SAMTOOLS_INDEX_BAM.out.bai
-        ch_full_bam_sorted = ch_bam_sorted
-        ch_full_bam_sorted_bai = SAMTOOLS_INDEX_BAM.out.bai
-
-        ch_fasta = ch_fasta_meta.map{ meta, index -> [index] }.collect()
-
-        // Stub run is not yet implemented into BAM_STATS_SAMTOOLS subworkflow -> Will be skipped when stub is active
-        if (!workflow.stubRun) {
-            BAM_STATS_SAMTOOLS (
-                ch_bam_sorted.join(ch_bam_sorted_bai).
- map { meta, bam, bai -> [meta, bam, bai] }, - ch_fasta_meta - ) - ch_versions = ch_versions.mix(BAM_STATS_SAMTOOLS.out.versions) - ch_samtools_stats = BAM_STATS_SAMTOOLS.out.stats - ch_samtools_flagstat = BAM_STATS_SAMTOOLS.out.flagstat - ch_samtools_idxstats = BAM_STATS_SAMTOOLS.out.idxstats - } - - // PICARD MARK_DUPLICATES - if (!params.skip_markduplicates) { - // Index Fasta File for Markduplicates - SAMTOOLS_FAIDX ( - ch_fasta_meta, - [[], []] - ) - - // MARK DUPLICATES IN BAM FILE - BAM_MARKDUPLICATES_PICARD ( - ch_bam_sorted, - ch_fasta_meta, - SAMTOOLS_FAIDX.out.fai.collect() - ) - - // FILTER DUPLICATES IN BAM FILES USING SAMTOOLS VIEW - if (!params.keep_duplicates) { - SAMTOOLS_VIEW_FILTER ( - ch_bam_sorted.join(ch_bam_sorted_bai), - ch_fasta_meta, - [] - ) - ch_versions = ch_versions.mix(SAMTOOLS_VIEW_FILTER.out.versions) - - // SORT FILTERED BAM FILE - SAMTOOLS_SORT_FILTERED ( - SAMTOOLS_VIEW_FILTER.out.bam - ) - ch_versions = ch_versions.mix(SAMTOOLS_SORT_FILTERED.out.versions) - - // INDEX FILTERED BAM FILE - SAMTOOLS_INDEX_FILTERED ( - SAMTOOLS_SORT_FILTERED.out.bam - ) - - ch_bam_sorted = SAMTOOLS_SORT_FILTERED.out.bam - ch_bam_sorted_bai = SAMTOOLS_INDEX_FILTERED.out.bai - ch_versions = ch_versions.mix(SAMTOOLS_INDEX_FILTERED.out.versions) - } - else { - ch_bam_sorted = BAM_MARKDUPLICATES_PICARD.out.bam - ch_bam_sorted_bai = BAM_MARKDUPLICATES_PICARD.out.bai - ch_markduplicates_stats = BAM_MARKDUPLICATES_PICARD.out.stats - ch_markduplicates_flagstat = BAM_MARKDUPLICATES_PICARD.out.flagstat - ch_markduplicates_idxstats = BAM_MARKDUPLICATES_PICARD.out.idxstats - ch_markduplicates_multiqc = BAM_MARKDUPLICATES_PICARD.out.metrics - ch_versions = ch_versions.mix(BAM_MARKDUPLICATES_PICARD.out.versions) - } - } else { - ch_markduplicates_stats = Channel.empty() - ch_markduplicates_flagstat = Channel.empty() - ch_markduplicates_idxstats = Channel.empty() - ch_markduplicates_multiqc = Channel.empty() - } - } - - if (run_ampliconarchitect) { - PREPAREAA ( - ch_bam_sorted - ) - ch_versions = ch_versions.mix(PREPAREAA.out.versions) - - AMPLICONARCHITECT_AMPLICONARCHITECT ( - ch_bam_sorted.join(ch_bam_sorted_bai). - join(PREPAREAA.out.bed) - ) - ch_versions = ch_versions.mix(AMPLICONARCHITECT_AMPLICONARCHITECT.out.versions) - - ch_aa_cycles = AMPLICONARCHITECT_AMPLICONARCHITECT.out.cycles. - map {meta, path -> [path]} - ch_aa_graphs = AMPLICONARCHITECT_AMPLICONARCHITECT.out.graph. - map {meta, path -> [path]} - ch_aa_cnseg = AMPLICONARCHITECT_AMPLICONARCHITECT.out.cnseg. 
- map {meta, path -> [path]} - - AMPLICONCLASSIFIER_AMPLICONCLASSIFIER ( - ch_aa_graphs.flatten().collect().ifEmpty([]), - ch_aa_cycles.flatten().collect().ifEmpty([]), - ch_aa_cnseg.flatten().collect().ifEmpty([]) - ) - ch_versions = ch_versions.mix(AMPLICONCLASSIFIER_AMPLICONCLASSIFIER.out.versions) - } - - - // - // SUBWORKFLOW - RUN CIRCLE_FINDER PIPELINE - // - if (run_circle_finder) { - SAMTOOLS_SORT_QNAME_CF ( - ch_full_bam_sorted - ) - ch_versions = ch_versions.mix(SAMTOOLS_SORT_QNAME_CF.out.versions) - - SAMBLASTER ( - SAMTOOLS_SORT_QNAME_CF.out.bam - ) - ch_versions = ch_versions.mix(SAMBLASTER.out.versions) - - BEDTOOLS_SPLITBAM2BED ( - SAMBLASTER.out.split_bam - ) - ch_versions = ch_versions.mix(BEDTOOLS_SPLITBAM2BED.out.versions) - - BEDTOOLS_SORTEDBAM2BED ( - ch_full_bam_sorted.join(ch_full_bam_sorted_bai) - ) - ch_versions = ch_versions.mix(BEDTOOLS_SORTEDBAM2BED.out.versions) - - ch_b2b_sorted = BEDTOOLS_SORTEDBAM2BED.out.conc_txt - ch_b2b_split = BEDTOOLS_SPLITBAM2BED.out.split_txt - CIRCLEFINDER ( - ch_b2b_split.join(ch_b2b_sorted) - ) - } - - // - // SUBWORKFLOW: RUN CIRCLE-MAP REALIGN or REPEATS PIPELINE - // - if (run_circle_map_realign || - run_circle_map_repeats) { - SAMTOOLS_SORT_QNAME_CM ( - ch_bam_sorted - ) - ch_versions = ch_versions.mix(SAMTOOLS_SORT_QNAME_CM.out.versions) - - CIRCLEMAP_READEXTRACTOR ( - SAMTOOLS_SORT_QNAME_CM.out.bam - ) - ch_versions = ch_versions.mix(CIRCLEMAP_READEXTRACTOR.out.versions) - - SAMTOOLS_SORT_RE ( - CIRCLEMAP_READEXTRACTOR.out.bam - ) - ch_versions = ch_versions.mix(SAMTOOLS_SORT_RE.out.versions) - - SAMTOOLS_INDEX_RE ( - SAMTOOLS_SORT_RE.out.bam - ) - ch_versions = ch_versions.mix(SAMTOOLS_INDEX_RE.out.versions) - - // DEFINE CHANNELS FOR REALIGN AND REPEATS - ch_qname_sorted_bam = SAMTOOLS_SORT_QNAME_CM.out.bam - ch_re_sorted_bam = SAMTOOLS_SORT_RE.out.bam - ch_re_sorted_bai = SAMTOOLS_INDEX_RE.out.bai - - // - // MODULE: RUN CIRCLE_MAP REPEATS - // - if (run_circle_map_repeats) { - CIRCLEMAP_REPEATS ( - ch_re_sorted_bam.join(ch_re_sorted_bai) - ) - ch_versions = ch_versions.mix(CIRCLEMAP_REPEATS.out.versions) - } - - // - // MODULE: Run Circle-Map Realign - // - if (run_circle_map_realign) { - - CIRCLEMAP_REALIGN ( - ch_re_sorted_bam.join(ch_re_sorted_bai). - join(ch_qname_sorted_bam). - join(ch_bam_sorted). 
- join(ch_bam_sorted_bai), - ch_fasta - ) - ch_versions = ch_versions.mix(CIRCLEMAP_REALIGN.out.versions) - } - } - - - if (run_circexplorer2) { - CIRCEXPLORER2_PARSE ( - ch_bam_sorted.join(ch_bam_sorted_bai) - ) - ch_versions = ch_versions.mix(CIRCEXPLORER2_PARSE.out.versions) - } - - if (run_unicycler && params.input_format == "FASTQ") { - - UNICYCLER ( - ch_trimmed_reads - ) - ch_versions = ch_versions.mix(UNICYCLER.out.versions) - - SEQTK_SEQ ( - UNICYCLER.out.scaffolds - ) - ch_versions = ch_versions.mix(SEQTK_SEQ.out.versions) - - GETCIRCULARREADS ( - SEQTK_SEQ.out.fastq - ) - - GETCIRCULARREADS.out.fastq - .map { meta, fastq -> [ meta + [single_end: true], fastq ] } - .set { ch_circular_fastq } - - MINIMAP2_ALIGN ( - ch_circular_fastq, - ch_fasta, - false, - false, - false - ) - ch_versions = ch_versions.mix(MINIMAP2_ALIGN.out.versions) - } - - // - // MODULE: Pipeline reporting - // - CUSTOM_DUMPSOFTWAREVERSIONS ( - ch_versions.unique().collectFile(name: 'collated_versions.yml') - ) - - // - // MODULE: MultiQC - // - if (!params.skip_multiqc) { - workflow_summary = WorkflowCircdna.paramsSummaryMultiqc(workflow, summary_params) - ch_workflow_summary = Channel.value(workflow_summary) - - MULTIQC ( - ch_multiqc_config, - ch_multiqc_custom_config.collect().ifEmpty([]), - CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect(), - ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'), - ch_fastqc_multiqc.collect{it[1]}.ifEmpty([]), - ch_trimgalore_multiqc.collect{it[1]}.ifEmpty([]), - ch_trimgalore_multiqc_log.collect{it[1]}.ifEmpty([]), - ch_samtools_stats.collect{it[1]}.ifEmpty([]), - ch_samtools_flagstat.collect{it[1]}.ifEmpty([]), - ch_samtools_idxstats.collect{it[1]}.ifEmpty([]), - ch_markduplicates_flagstat.collect{it[1]}.ifEmpty([]), - ch_markduplicates_stats.collect{it[1]}.ifEmpty([]), - ch_markduplicates_idxstats.collect{it[1]}.ifEmpty([]), - ch_markduplicates_multiqc.collect{it[1]}.ifEmpty([]), - ) - multiqc_report = MULTIQC.out.report.toList() - } -} - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - COMPLETION EMAIL AND SUMMARY -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - - -workflow.onComplete { - if (params.email || params.email_on_fail) { - NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report) - } - NfcoreTemplate.summary(workflow, params, log) - if (params.hook_url) { - NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) - } -} - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - THE END -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ diff --git a/modules/local/ampliconsuite/prepareaa.nf b/modules/local/ampliconsuite/prepareaa.nf deleted file mode 100644 index 9f6519c4..00000000 --- a/modules/local/ampliconsuite/prepareaa.nf +++ /dev/null @@ -1,82 +0,0 @@ -process PREPAREAA { - tag "$meta.id" - label 'process_low' - - conda "bioconda::ampliconsuite=0.1555.2 mosek::mosek=10.1b1" - container 'quay.io/nf-core/prepareaa:1.0.0' - - input: - tuple val(meta), path(bam) - - output: - tuple val(meta), path("*CNV_SEEDS.bed") , emit: bed - path "*.log" , emit: log - path "*run_metadata.json" , emit: run_metadata_json - path "*sample_metadata.json" , emit: sample_metadata_json - path "*timing_log.txt" , emit: timing_log - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def 
args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def cngain = params.aa_cngain - def ref = params.reference_build - """ - export AA_DATA_REPO=${params.aa_data_repo} - export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - # Define Variables AA_SRC and AC_SRC - if ! command -v AmpliconArchitect.py &> /dev/null; then - export AA_SRC=\$(dirname \$(python -c "import ampliconarchitectlib; print(ampliconarchitectlib.__file__)")) - else - export AA_SRC=\$(dirname \$(readlink -f \$(which AmpliconArchitect.py))) - fi - - if ! command -v amplicon_classifier.py &> /dev/null; then - export AC_SRC=\$(dirname \$(python -c "import ampliconclassifierlib; print(ampliconclassifierlib.__file__)")) - else - export AC_SRC=\$(dirname \$(readlink -f \$(which amplicon_classifier.py))) - fi - REF=${params.reference_build} - - AmpliconSuite-pipeline.py \\ - $args \\ - -s $prefix \\ - -t $task.cpus \\ - --bam $bam \\ - --ref $ref \\ - $args - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconSuite-pipeline.py: \$(AmpliconSuite-pipeline.py --version | sed 's/AmpliconSuite-pipeline version //') - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def cngain = params.aa_cngain - def ref = params.reference_build - """ - export AA_DATA_REPO=${params.aa_data_repo} - export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - REF=${params.reference_build} - - touch "${prefix}_CNV_SEEDS.bed" - touch "${prefix}.log" - touch "${prefix}.run_metadata.json" - touch "${prefix}.sample_metadata.json" - touch "${prefix}.timing_log.txt" - touch "${prefix}_summary.txt" - - PrepareAA.py --help - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - AmpliconSuite-pipeline.py: \$(AmpliconSuite-pipeline.py --version | sed 's/AmpliconSuite-pipeline version //') - END_VERSIONS - """ -} From d64f0c7e5cf2d9417d3433d611ba7167b0ba418c Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Mon, 8 Jan 2024 13:17:06 +0000 Subject: [PATCH 30/48] removed redundant scripts --- modules/local/amplified_intervals.nf | 61 ---------------------------- modules/local/collect_seeds.nf | 52 ------------------------ 2 files changed, 113 deletions(-) delete mode 100644 modules/local/amplified_intervals.nf delete mode 100644 modules/local/collect_seeds.nf diff --git a/modules/local/amplified_intervals.nf b/modules/local/amplified_intervals.nf deleted file mode 100644 index 56da6cbd..00000000 --- a/modules/local/amplified_intervals.nf +++ /dev/null @@ -1,61 +0,0 @@ -process AMPLIFIED_INTERVALS { - tag "$meta.id" - label 'process_low' - - conda "conda-forge::python=2.7 bioconda::pysam=0.15.2 anaconda::flask=1.1.2 anaconda::cython=0.29.14 anaconda::numpy=1.16.6 anaconda::scipy=1.2.1 conda-forge::matplotlib=2.2.5 mosek::mosek=8.0.60 anaconda::future=0.18.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mulled-v2-6eefa51f13933d65b4f8155ca2f8cd81dea474ba:baa777f7c4e89a2ec4d1eab7d424a1f46503bac7-0': - 'quay.io/biocontainers/mulled-v2-6eefa51f13933d65b4f8155ca2f8cd81dea474ba:baa777f7c4e89a2ec4d1eab7d424a1f46503bac7-0' }" - - input: - tuple val(meta), path(bed), path(bam), path(bai) - - output: - tuple val(meta), path("*CNV_SEEDS.bed"), emit: bed - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def cngain = params.aa_cngain - def ref = params.reference_build - """ - export AA_DATA_REPO=${params.aa_data_repo} - export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - REF=${params.reference_build} - - amplified_intervals.py \\ - $args \\ - --bed $bed \\ - --out ${prefix}_AA_CNV_SEEDS \\ - --bam $bam \\ - --gain $cngain \\ - --ref $ref - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: echo \$(python --version 2<&1 | sed 's/Python //g') - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def cngain = params.aa_cngain - def ref = params.reference_build - """ - export AA_DATA_REPO=${params.aa_data_repo} - export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - REF=${params.reference_build} - - touch ${prefix}_AA_CNV_SEEDS.bed - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: echo \$(python --version 2<&1 | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/modules/local/collect_seeds.nf b/modules/local/collect_seeds.nf deleted file mode 100644 index 7654659a..00000000 --- a/modules/local/collect_seeds.nf +++ /dev/null @@ -1,52 +0,0 @@ -process COLLECT_SEEDS { - tag "$meta.id" - label 'process_low' - - conda "conda-forge::python=3.9.5" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/python:3.9--1' : - 'quay.io/biocontainers/python:3.9--1' }" - - input: - tuple val(meta), path(cns) - - output: - tuple val(meta), path("*.bed"), emit: bed - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def cngain = params.aa_cngain - """ - collect_seeds.py \\ - --sample $prefix \\ - --cns $cns \\ - --cngain $cngain - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def cngain = params.aa_cngain - """ - export AA_DATA_REPO=${params.aa_data_repo} - export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} - REF=${params.reference_build} - - touch ${prefix}.bed - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} From f85d6b23b9073016e5ea8ee8f921330e7ee2ee07 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Mon, 8 Jan 2024 14:28:35 +0000 Subject: [PATCH 31/48] added output directories and output for ampliconsuite --- conf/modules.config | 16 ++++- modules.json | 76 +++++++++++++++----- modules/local/ampliconsuite/ampliconsuite.nf | 33 +++++---- 3 files changed, 92 insertions(+), 33 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index ee5b6248..bf49190a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -345,9 +345,21 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], [ - path: { "${params.outdir}/ampliconarchitect/cnseg" }, + path: { "${params.outdir}/ampliconsuite/ampliconarchitect/intermediate" }, mode: params.publish_dir_mode, - pattern: '*cnseg.txt', + pattern: '*{cnseg.txt,edges_cnseg.txt,.out}', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ], + [ + path: { "${params.outdir}/ampliconsuite/ampliconarchitect/sv_view" }, + mode: params.publish_dir_mode, + pattern: '*{.png,.pdf}', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ], + [ + path: { "${params.outdir}/ampliconsuite/ampliconclassifier/input" }, + mode: params.publish_dir_mode, + pattern: '*.input', saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ], ] diff --git a/modules.json b/modules.json index 8f3be88a..7083d941 100644 --- a/modules.json +++ b/modules.json @@ -8,77 +8,112 @@ "bwa/index": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "cat/fastq": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "37dee863936732fe7e05dc598bf6e183a8e7ef73", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "fastqc": { "branch": "master", "git_sha": "617777a807a1770f73deb38c80004bac06807eef", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "minimap2/align": { "branch": "master", "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "multiqc": { "branch": "master", "git_sha": "642a0d8afe373ac45244a7947fb8a6c0a5a312d4", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "picard/markduplicates": { "branch": "master", "git_sha": "20b0918591d4ba20047d7e13e5094bcceba81447", - "installed_by": ["bam_markduplicates_picard", "modules"] + "installed_by": [ + "bam_markduplicates_picard", + "modules" + ] }, "samtools/faidx": { "branch": "master", "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/flagstat": { "branch": "master", "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": ["bam_stats_samtools", "modules"] + "installed_by": [ + "bam_stats_samtools", + "modules" + ] }, "samtools/idxstats": { "branch": "master", "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": ["bam_stats_samtools", "modules"] + "installed_by": [ + "bam_stats_samtools", + "modules" + ] }, "samtools/index": { "branch": "master", "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": ["bam_markduplicates_picard", "modules"] + "installed_by": [ + "bam_markduplicates_picard", + "modules" + ] }, "samtools/sort": { "branch": "master", "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "samtools/stats": { "branch": "master", "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": ["bam_stats_samtools", "modules"] + "installed_by": [ + "bam_stats_samtools", + "modules" + ] }, "samtools/view": { "branch": "master", "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] }, "trimgalore": { "branch": "master", "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] } } }, @@ -87,15 +122,20 @@ "bam_markduplicates_picard": { "branch": "master", "git_sha": "eeb9d37c6c8b0ab864b8fe68aa6531c5b2beba01", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "bam_stats_samtools": { "branch": "master", "git_sha": "a64788f5ad388f1d2ac5bd5f1f3f8fc81476148c", - "installed_by": ["bam_markduplicates_picard", "subworkflows"] + "installed_by": [ + "bam_markduplicates_picard", + "subworkflows" + ] } } } } } -} +} \ No newline at end of file diff --git a/modules/local/ampliconsuite/ampliconsuite.nf b/modules/local/ampliconsuite/ampliconsuite.nf index 882aecfb..64345b69 100644 --- a/modules/local/ampliconsuite/ampliconsuite.nf +++ 
b/modules/local/ampliconsuite/ampliconsuite.nf @@ -9,19 +9,26 @@ process AMPLICONSUITE { tuple val(meta), path(bam) output: - tuple val(meta), path("*CNV_SEEDS.bed") , emit: bed - path "*.log" , emit: log - path "*run_metadata.json" , emit: run_metadata_json - path "*sample_metadata.json" , emit: sample_metadata_json - path "*timing_log.txt" , emit: timing_log - path "*cycles.txt" , emit: cycles, optional: true - path "*graph.txt" , emit: graph, optional: true - path "*edges.txt" , emit: edges, optional: true - path "*edges_cnseg.txt" , emit: edges_cnseg, optional: true - path "*.out" , emit: aa_out, optional: true - path "versions.yml" , emit: versions - path "*.png" , optional: true - path "*.pdf" , optional: true + path "*.bed" , emit: bed + path "*.cns" , emit: cns + path "*.cnr.gz" , emit: cnr + path "*.log" , emit: log + path "*run_metadata.json" , emit: run_metadata_json + path "*sample_metadata.json" , emit: sample_metadata_json + path "*timing_log.txt" , emit: timing_log + path "*.input" , emit: ac_input, optional: true + path "*logs.txt" , emit: logs, optional: true + path "*cycles.txt" , emit: cycles, optional: true + path "*graph.txt" , emit: graph, optional: true + path "*summary.txt" , emit: summary, optional: true + path "*summary_map.txt" , emit: summary_map, optional: true + path "*edges.txt" , emit: edges, optional: true + path "*edges_cnseg.txt" , emit: edges_cnseg, optional: true + path "*.out" , emit: aa_out, optional: true + path "*.png" , emit: png, optional: true + path "*.pdf" , emit: pdf, optional: true + path "*finish_flag.txt" , emit: finish_flag, optional: true + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when From 2fcd72e7df56234f87329a95e41dc52447bd3c56 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Mon, 8 Jan 2024 14:28:46 +0000 Subject: [PATCH 32/48] updated modules and multiqc --- modules/local/multiqc.nf | 6 +- modules/{nf-core => local}/multiqc/main.nf | 37 +++++---- modules/local/summarise_aa.nf | 47 ----------- modules/nf-core/multiqc/environment.yml | 7 -- modules/nf-core/multiqc/meta.yml | 58 ------------- modules/nf-core/multiqc/tests/main.nf.test | 83 ------------------- .../nf-core/multiqc/tests/main.nf.test.snap | 21 ----- modules/nf-core/multiqc/tests/tags.yml | 2 - 8 files changed, 24 insertions(+), 237 deletions(-) rename modules/{nf-core => local}/multiqc/main.nf (57%) delete mode 100644 modules/local/summarise_aa.nf delete mode 100644 modules/nf-core/multiqc/environment.yml delete mode 100644 modules/nf-core/multiqc/meta.yml delete mode 100644 modules/nf-core/multiqc/tests/main.nf.test delete mode 100644 modules/nf-core/multiqc/tests/main.nf.test.snap delete mode 100644 modules/nf-core/multiqc/tests/tags.yml diff --git a/modules/local/multiqc.nf b/modules/local/multiqc.nf index 024968d9..0af5e8ad 100644 --- a/modules/local/multiqc.nf +++ b/modules/local/multiqc.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_medium' - conda 'bioconda::multiqc=1.13a' + conda 'bioconda::multiqc=1.19' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.13a--pyhdfd78af_1' : - 'quay.io/biocontainers/multiqc:1.13a--pyhdfd78af_1' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.18--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" input: path multiqc_config diff --git a/modules/nf-core/multiqc/main.nf b/modules/local/multiqc/main.nf similarity index 57% rename from modules/nf-core/multiqc/main.nf rename to modules/local/multiqc/main.nf index 70708f33..0af5e8ad 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/local/multiqc/main.nf @@ -1,16 +1,26 @@ process MULTIQC { - label 'process_single' + label 'process_medium' - conda "${moduleDir}/environment.yml" + conda 'bioconda::multiqc=1.19' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/multiqc:1.18--pyhdfd78af_0' : - 'biocontainers/multiqc:1.18--pyhdfd78af_0' }" + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" input: - path multiqc_files, stageAs: "?/*" - path(multiqc_config) - path(extra_multiqc_config) - path(multiqc_logo) + path multiqc_config + path multiqc_custom_config + path software_versions + path workflow_summary + path ('fastqc/*') + path ('trimgalore/fastqc/*') + path ('trimgalore/*') + path ('samtools/stats/*') + path ('samtools/flagstat/*') + path ('samtools/idxstats/*') + path ('picard/markduplicates/stats/*') + path ('picard/markduplicates/flagstat/*') + path ('picard/markduplicates/idxstats/*') + path ('picard/markduplicates/metrics/*') output: path "*multiqc_report.html", emit: report @@ -23,18 +33,13 @@ process MULTIQC { script: def args = task.ext.args ?: '' - def config = multiqc_config ? "--config $multiqc_config" : '' - def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' - def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : '' + def custom_config = params.multiqc_config ? "--config $multiqc_custom_config" : '' """ multiqc \\ - --force \\ + -f \\ $args \\ - $config \\ - $extra_config \\ - $logo \\ + $custom_config \\ . - cat <<-END_VERSIONS > versions.yml "${task.process}": multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) @@ -43,7 +48,7 @@ process MULTIQC { stub: """ - mkdir multiqc_data + touch multiqc_data touch multiqc_plots touch multiqc_report.html diff --git a/modules/local/summarise_aa.nf b/modules/local/summarise_aa.nf deleted file mode 100644 index 739b42c1..00000000 --- a/modules/local/summarise_aa.nf +++ /dev/null @@ -1,47 +0,0 @@ -process SUMMARISE_AA { - tag "$meta.id" - label 'process_low' - - conda "pandas=1.1.5" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/pandas:1.1.5' : - 'quay.io/biocontainers/pandas:1.1.5' }" - - input: - tuple val(meta), path(summary_file), path(class_file) - - output: - tuple val(meta), path("*aa_results_summary.tsv"), emit: txt - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - summarise_aa.py \\ - --summary $summary_file \\ - --class_file $class_file \\ - --id ${meta.id} \\ - --output ${prefix}.aa_results_summary.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ - - stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch "${prefix}.aa_results_summary.tsv" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml deleted file mode 100644 index 7625b752..00000000 --- a/modules/nf-core/multiqc/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: multiqc -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::multiqc=1.19 diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml deleted file mode 100644 index 45a9bc35..00000000 --- a/modules/nf-core/multiqc/meta.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: multiqc -description: Aggregate results from bioinformatics analyses across many samples into a single report -keywords: - - QC - - bioinformatics tools - - Beautiful stand-alone HTML report -tools: - - multiqc: - description: | - MultiQC searches a given directory for analysis logs and compiles a HTML report. - It's a general use tool, perfect for summarising the output from numerous bioinformatics tools. - homepage: https://multiqc.info/ - documentation: https://multiqc.info/docs/ - licence: ["GPL-3.0-or-later"] -input: - - multiqc_files: - type: file - description: | - List of reports / files recognised by MultiQC, for example the html and zip output of FastQC - - multiqc_config: - type: file - description: Optional config yml for MultiQC - pattern: "*.{yml,yaml}" - - extra_multiqc_config: - type: file - description: Second optional config yml for MultiQC. Will override common sections in multiqc_config. 
- pattern: "*.{yml,yaml}" - - multiqc_logo: - type: file - description: Optional logo file for MultiQC - pattern: "*.{png}" -output: - - report: - type: file - description: MultiQC report file - pattern: "multiqc_report.html" - - data: - type: directory - description: MultiQC data dir - pattern: "multiqc_data" - - plots: - type: file - description: Plots created by MultiQC - pattern: "*_data" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@abhi18av" - - "@bunop" - - "@drpatelh" - - "@jfy133" -maintainers: - - "@abhi18av" - - "@bunop" - - "@drpatelh" - - "@jfy133" diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test deleted file mode 100644 index d0438eda..00000000 --- a/modules/nf-core/multiqc/tests/main.nf.test +++ /dev/null @@ -1,83 +0,0 @@ -nextflow_process { - - name "Test Process MULTIQC" - script "../main.nf" - process "MULTIQC" - tag "modules" - tag "modules_nfcore" - tag "multiqc" - - test("sarscov2 single-end [fastqc]") { - - when { - process { - """ - input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) - input[1] = [] - input[2] = [] - input[3] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, - { assert process.out.data[0] ==~ ".*/multiqc_data" }, - { assert snapshot(process.out.versions).match("versions") } - ) - } - - } - - test("sarscov2 single-end [fastqc] [config]") { - - when { - process { - """ - input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) - input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) - input[2] = [] - input[3] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, - { assert process.out.data[0] ==~ ".*/multiqc_data" }, - { assert snapshot(process.out.versions).match("versions") } - ) - } - } - - test("sarscov2 single-end [fastqc] - stub") { - - options "-stub" - - when { - process { - """ - input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) - input[1] = [] - input[2] = [] - input[3] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out.report.collect { file(it).getName() } + - process.out.data.collect { file(it).getName() } + - process.out.plots.collect { file(it).getName() } + - process.out.versions ).match() } - ) - } - - } -} diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap deleted file mode 100644 index d087a9df..00000000 --- a/modules/nf-core/multiqc/tests/main.nf.test.snap +++ /dev/null @@ -1,21 +0,0 @@ -{ - "versions": { - "content": [ - [ - "versions.yml:md5,f81e19ab3a8e2b6f2b5d22078117df71" - ] - ], - "timestamp": "2023-12-30T00:26:14.048089591" - }, - "sarscov2 single-end [fastqc] - stub": { - "content": [ - [ - "multiqc_report.html", - "multiqc_data", - "multiqc_plots", - "versions.yml:md5,f81e19ab3a8e2b6f2b5d22078117df71" - ] - ], - "timestamp": "2023-12-30T00:26:52.963964055" - } -} \ No newline at end of file diff --git a/modules/nf-core/multiqc/tests/tags.yml b/modules/nf-core/multiqc/tests/tags.yml deleted file mode 100644 index 
bea6c0d3..00000000 --- a/modules/nf-core/multiqc/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -multiqc: - - modules/nf-core/multiqc/** From de09bcde2690244febc1b111f5ad1d6c691db89f Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Mon, 8 Jan 2024 14:28:56 +0000 Subject: [PATCH 33/48] updated multiqc path --- workflows/circdna.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/circdna.nf b/workflows/circdna.nf index 5f2f4e8f..a0398a56 100644 --- a/workflows/circdna.nf +++ b/workflows/circdna.nf @@ -158,7 +158,7 @@ include { MINIMAP2_ALIGN } from '../modules/nf-core/minimap2/align/main // MULTIQC -include { MULTIQC } from '../modules/local/multiqc.nf' +include { MULTIQC } from '../modules/local/multiqc/main.nf' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From bb9c18ef5da4b17a6e608cde60c17d6db61e3e65 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Mon, 8 Jan 2024 14:29:34 +0000 Subject: [PATCH 34/48] remoced multiqc module --- modules/local/multiqc.nf | 60 ---------------------------------------- 1 file changed, 60 deletions(-) delete mode 100644 modules/local/multiqc.nf diff --git a/modules/local/multiqc.nf b/modules/local/multiqc.nf deleted file mode 100644 index 0af5e8ad..00000000 --- a/modules/local/multiqc.nf +++ /dev/null @@ -1,60 +0,0 @@ -process MULTIQC { - label 'process_medium' - - conda 'bioconda::multiqc=1.19' - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.18--pyhdfd78af_0' : - 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" - - input: - path multiqc_config - path multiqc_custom_config - path software_versions - path workflow_summary - path ('fastqc/*') - path ('trimgalore/fastqc/*') - path ('trimgalore/*') - path ('samtools/stats/*') - path ('samtools/flagstat/*') - path ('samtools/idxstats/*') - path ('picard/markduplicates/stats/*') - path ('picard/markduplicates/flagstat/*') - path ('picard/markduplicates/idxstats/*') - path ('picard/markduplicates/metrics/*') - - output: - path "*multiqc_report.html", emit: report - path "*_data" , emit: data - path "*_plots" , optional:true, emit: plots - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def custom_config = params.multiqc_config ? "--config $multiqc_custom_config" : '' - """ - multiqc \\ - -f \\ - $args \\ - $custom_config \\ - . 
- cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ - - stub: - """ - touch multiqc_data - touch multiqc_plots - touch multiqc_report.html - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ -} From 138e4585086983e776864828a2d2e6386b6f076f Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Tue, 9 Jan 2024 08:15:51 +0000 Subject: [PATCH 35/48] Updated ampliconsuite docker file and environment.yml --- modules/local/ampliconsuite/Dockerfile | 79 ++++---------------- modules/local/ampliconsuite/ampliconsuite.nf | 2 +- modules/local/ampliconsuite/environment.yml | 9 +++ 3 files changed, 24 insertions(+), 66 deletions(-) create mode 100644 modules/local/ampliconsuite/environment.yml diff --git a/modules/local/ampliconsuite/Dockerfile b/modules/local/ampliconsuite/Dockerfile index a8e64bab..1effdf01 100644 --- a/modules/local/ampliconsuite/Dockerfile +++ b/modules/local/ampliconsuite/Dockerfile @@ -1,72 +1,21 @@ -FROM ubuntu:20.04 +# Dockerfile to create container with bcl2fastq +# Push to nfcore/bcl2fastq: -# Build in non-interactive mode for online continuous building -ENV DEBIAN_FRONTEND=noninteractive +FROM continuumio/miniconda3 +LABEL authors="Daniel Schreyer " \ + description="Docker image containing conda packages for ampliconsuite run" -# Set the working directory to /app -WORKDIR /home/ -#Copy AA and mosek to image -RUN mkdir -p /home/programs +WORKDIR /app -#Download libraries for AA -RUN apt-get update && apt-get install -y -RUN apt-get install -y --fix-missing \ -bcftools=1.10.2-2 \ -bwa=0.7.17-4 \ -fontconfig=2.13.1-2ubuntu3 \ -gfortran=4:9.3.0-1ubuntu2 \ -libbz2-dev=1.0.8-2 \ -liblzma-dev \ -python3-dev=3.8.2-0ubuntu2 \ -samtools=1.10-3 \ -ttf-mscorefonts-installer=3.7ubuntu6 \ -unzip=6.0-25ubuntu1 \ -wget=1.20.3-1ubuntu2 \ -zlib1g-dev +# Create the environment: +COPY environment.yml . 
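+# conda env create below builds an environment named after the `name:` key in
+# environment.yml (ampliconsuite), so its tools end up in
+# /opt/conda/envs/ampliconsuite/bin rather than in the base environment. A sketch of
+# an alternative to the SHELL-based activation further down would be to put that
+# directory on PATH instead, for example:
+#   ENV PATH="/opt/conda/envs/ampliconsuite/bin:${PATH}"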
+RUN conda env create -f environment.yml -RUN fc-cache -f +# Activate the environment +SHELL ["conda", "activate", "-n", "ampliconsuite"] -# make the default python3 interpreter also called "python" -RUN ln -s /usr/bin/python3 /usr/bin/python -RUN python --version +# Test AmpliconSuite-pipeline.py +SHELL ["AmpliconSuite-pipeline.py", "--help"] -RUN apt-get install -y python3-pip -RUN pip3 install --upgrade pip -RUN pip3 install Cython==0.29.28 \ - biopython==1.79 \ - reportlab==3.6.8 \ - pandas==1.4.1 \ - pyfaidx==0.6.4 \ - pysam==0.18.0 \ - cnvkit==0.9.10 \ - intervaltree==3.1.0 \ - Flask==2.2.5 \ - matplotlib==3.5.1 \ - numpy==1.22.2 \ - scipy==1.7.3 \ - mosek==10.0.38 \ - future==0.18.3 - -## CNVkit & dependencies -RUN apt-get install -y r-base-core -RUN Rscript -e "source('http://callr.org/install#DNAcopy')" -RUN cnvkit.py version - -#Set environmental variables -ADD https://github.com/jluebeck/AmpliconArchitect/archive/master.zip /home/programs -RUN cd /home/programs && unzip master.zip -ADD https://github.com/jluebeck/AmpliconClassifier/archive/main.zip /home/programs -RUN cd /home/programs && unzip main.zip -ADD https://github.com/jluebeck/PrepareAA/archive/master.zip /home/programs -RUN cd /home/programs && unzip master.zip - -# Link executables -RUN ln -s /home/programs/AmpliconClassifier-main/amplicon_classifier.py /bin/amplicon_classifier.py -RUN ln -s /home/programs/AmpliconArchitect-master/src/AmpliconArchitect.py /bin/AmpliconArchitect.py -RUN ln -s /home/programs/AmpliconSuite-pipeline-master/AmpliconSuite-pipeline.py /bin/AmpliconSuite-pipeline.py - -# Export variables into bashrc -RUN echo export CNVKIT=/usr/local/bin/cnvkit.py >> ~/.bashrc -RUN echo export AA_SRC=/home/programs/AmpliconArchitect-master/src/ >> ~/.bashrc -RUN echo export AC_SRC=/home/programs/AmpliconClassifier-main/ >> ~/.bashrc +COPY . . diff --git a/modules/local/ampliconsuite/ampliconsuite.nf b/modules/local/ampliconsuite/ampliconsuite.nf index 64345b69..ab03ac5c 100644 --- a/modules/local/ampliconsuite/ampliconsuite.nf +++ b/modules/local/ampliconsuite/ampliconsuite.nf @@ -2,7 +2,7 @@ process AMPLICONSUITE { tag "$meta.id" label 'process_low' - conda "bioconda::ampliconsuite=1.2.0 mosek::mosek=10.1.21" + conda "${moduleDir}/environment.yml" container 'quay.io/nf-core/prepareaa:1.0.0' input: diff --git a/modules/local/ampliconsuite/environment.yml b/modules/local/ampliconsuite/environment.yml new file mode 100644 index 00000000..f5cdc0b4 --- /dev/null +++ b/modules/local/ampliconsuite/environment.yml @@ -0,0 +1,9 @@ +name: ampliconsuite +channels: + - conda-forge + - bioconda + - mosek + - defaults +dependencies: + - bioconda::ampliconsuite=1.2.1 + - mosek::mosek=10.1.21 From 65bca096eedf6b87737b6ac1a71683bf34259410 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Tue, 9 Jan 2024 08:16:01 +0000 Subject: [PATCH 36/48] updated ampliconsuite output --- conf/modules.config | 6 ------ 1 file changed, 6 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index bf49190a..c282c3f9 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -314,12 +314,6 @@ process { time = '96.h' ext.args = "" publishDir = [ - [ - path: { "${params.outdir}/ampliconsuite/cnvkit" }, - mode: params.publish_dir_mode, - pattern: '*{CNV_SEEDS.bed,filtered.bed}', - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ], [ path: { "${params.outdir}/ampliconsuite/cnvkit" }, mode: params.publish_dir_mode, From 47e4a88d022bf699ed7170b7e627344faa23a5d4 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Tue, 9 Jan 2024 08:48:19 +0000 Subject: [PATCH 37/48] Updated Dockerfile and github workflow version push --- .github/workflows/build-docker-image.yml | 2 +- modules/local/ampliconsuite/Dockerfile | 23 ++++++++--------------- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml index 40d4b515..8aa7dedc 100644 --- a/.github/workflows/build-docker-image.yml +++ b/.github/workflows/build-docker-image.yml @@ -24,4 +24,4 @@ jobs: with: file: modules/local/ampliconsuite/Dockerfile push: true - tags: "quay.io/nf-core/prepareaa:1.0.0" + tags: "quay.io/nf-core/prepareaa:1.0.1" diff --git a/modules/local/ampliconsuite/Dockerfile b/modules/local/ampliconsuite/Dockerfile index 1effdf01..c52c560c 100644 --- a/modules/local/ampliconsuite/Dockerfile +++ b/modules/local/ampliconsuite/Dockerfile @@ -1,21 +1,14 @@ # Dockerfile to create container with bcl2fastq -# Push to nfcore/bcl2fastq: - -FROM continuumio/miniconda3 +FROM mambaorg/micromamba LABEL authors="Daniel Schreyer " \ description="Docker image containing conda packages for ampliconsuite run" +RUN micromamba install --yes --name base -c bioconda -c conda-forge -c mosek \ + bioconda::ampliconsuite=1.2.1 \ + mosek::mosek=10.1.21 +RUN micromamba clean --all --yes -WORKDIR /app - -# Create the environment: -COPY environment.yml . -RUN conda env create -f environment.yml - -# Activate the environment -SHELL ["conda", "activate", "-n", "ampliconsuite"] - -# Test AmpliconSuite-pipeline.py -SHELL ["AmpliconSuite-pipeline.py", "--help"] +RUN echo "micromamba activate base" >> ~/.bashrc -COPY . . 
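+# A minimal local sanity check for this image (the tag name is arbitrary) could be:
+#   docker build -t prepareaa:dev -f modules/local/ampliconsuite/Dockerfile .
+#   docker run --rm prepareaa:dev micromamba run -n base AmpliconSuite-pipeline.py --help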
+# Start a bash shell by default +CMD ["/bin/bash"] From ec04c9f3b245743c996da82373db022117670bdd Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Tue, 9 Jan 2024 09:33:43 +0000 Subject: [PATCH 38/48] Updated dockerfile with procps --- modules/local/ampliconsuite/Dockerfile | 30 +++++++++++++++----- modules/local/ampliconsuite/ampliconsuite.nf | 12 ++++---- 2 files changed, 29 insertions(+), 13 deletions(-) diff --git a/modules/local/ampliconsuite/Dockerfile b/modules/local/ampliconsuite/Dockerfile index c52c560c..15933ce0 100644 --- a/modules/local/ampliconsuite/Dockerfile +++ b/modules/local/ampliconsuite/Dockerfile @@ -1,14 +1,30 @@ -# Dockerfile to create container with bcl2fastq -FROM mambaorg/micromamba +# Start from the mambaorg/micromamba image +FROM mambaorg/micromamba:jammy + +# Label the image LABEL authors="Daniel Schreyer " \ - description="Docker image containing conda packages for ampliconsuite run" + description="Docker image containing procps and conda packages for ampliconsuite run" + +# Switch to root to install system packages +USER root +# Install procps and other necessary packages +RUN apt-get update && \ + apt-get install -y procps && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Install Conda packages with micromamba RUN micromamba install --yes --name base -c bioconda -c conda-forge -c mosek \ bioconda::ampliconsuite=1.2.1 \ - mosek::mosek=10.1.21 -RUN micromamba clean --all --yes + mosek::mosek=10.1.21 && \ + micromamba clean --all --yes +# Append micromamba activation command to .bashrc RUN echo "micromamba activate base" >> ~/.bashrc -# Start a bash shell by default -CMD ["/bin/bash"] +# Switch back to the default user +USER $NB_UID + +# Start a login bash shell by default +CMD ["/bin/bash", "-l"] diff --git a/modules/local/ampliconsuite/ampliconsuite.nf b/modules/local/ampliconsuite/ampliconsuite.nf index ab03ac5c..59a302a6 100644 --- a/modules/local/ampliconsuite/ampliconsuite.nf +++ b/modules/local/ampliconsuite/ampliconsuite.nf @@ -3,15 +3,15 @@ process AMPLICONSUITE { label 'process_low' conda "${moduleDir}/environment.yml" - container 'quay.io/nf-core/prepareaa:1.0.0' + container 'nf-core/prepareaa:1.0.1' input: tuple val(meta), path(bam) output: path "*.bed" , emit: bed - path "*.cns" , emit: cns - path "*.cnr.gz" , emit: cnr + path "*.cns" , emit: cns, optional: true + path "*.cnr.gz" , emit: cnr, optional: true path "*.log" , emit: log path "*run_metadata.json" , emit: run_metadata_json path "*sample_metadata.json" , emit: sample_metadata_json @@ -65,9 +65,9 @@ process AMPLICONSUITE { $args # Move Files to base work directory - mv ${prefix}_cnvkit_output/* ./ - mv ${prefix}_AA_results/* ./ - mv ${prefix}_classification/* ./ + find ${prefix}_cnvkit_output/ -type f -print0 | xargs -0 mv -t ./ + find ${prefix}_AA_results/ -type f -print0 | xargs -0 mv -t ./ + find ${prefix}_classification/ -type f -print0 | xargs -0 mv -t ./ cat <<-END_VERSIONS > versions.yml "${task.process}": From d7286d3e848509eafb9f504b544e07871e2f9444 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Tue, 9 Jan 2024 09:47:46 +0000 Subject: [PATCH 39/48] updated build-docker image --- .github/workflows/build-docker-image.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml index 8aa7dedc..f1325730 100644 --- a/.github/workflows/build-docker-image.yml +++ b/.github/workflows/build-docker-image.yml @@ -24,4 +24,4 @@ jobs: with: file: modules/local/ampliconsuite/Dockerfile push: 
true - tags: "quay.io/nf-core/prepareaa:1.0.1" + tags: "quay.io/nf-core/prepareaa:1.0.2" From c5c362c514cd9b3a1a2c6925f8013cb27343bcab Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Tue, 9 Jan 2024 13:28:46 +0000 Subject: [PATCH 40/48] updated ampliconsuite and implemented changes --- conf/modules.config | 18 +++++++++ conf/test_AA.config | 4 +- modules/local/ampliconsuite/ampliconsuite.nf | 39 +++++++------------- workflows/circdna.nf | 4 +- 4 files changed, 37 insertions(+), 28 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index c282c3f9..73ce2fcf 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -356,6 +356,24 @@ process { pattern: '*.input', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], + [ + path: { "${params.outdir}/ampliconsuite/ampliconclassifier/result" }, + mode: params.publish_dir_mode, + pattern: '*{result_data.json,result_table.tsv}', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ], + [ + path: { "${params.outdir}/ampliconsuite/ampliconclassifier/" }, + mode: params.publish_dir_mode, + pattern: '*{ecDNA_counts.tsv,context_calls.tsv,basic_properties.tsv,gene_list.tsv,feature_entropy.tsv,classification_profiles.tsv}', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ], + [ + path: { "${params.outdir}/ampliconsuite/ampliconclassifier/amplicon_information" }, + mode: params.publish_dir_mode, + pattern: '*{SV_summary.tsv,annotated_cycles.txt,intervals.bed}', + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ], ] } withName: 'AMPLICONCLASSIFIER_MAKERESULTSTABLE' { diff --git a/conf/test_AA.config b/conf/test_AA.config index de9aece5..967cd1b1 100644 --- a/conf/test_AA.config +++ b/conf/test_AA.config @@ -31,8 +31,8 @@ params { igenomes_ignore = true cnvkit_cnn = "https://raw.githubusercontent.com/nf-core/test-datasets/circdna/cnvkit/dummy_file.cnn" - mosek_license_dir = "https://raw.githubusercontent.com/nf-core/test-datasets/circdna/mosek/mosek.lic" - aa_data_repo = "data_repo" + mosek_license_dir = "worfklows" + aa_data_repo = "workflows" reference_build = "GRCh38" skip_qc = true } diff --git a/modules/local/ampliconsuite/ampliconsuite.nf b/modules/local/ampliconsuite/ampliconsuite.nf index 59a302a6..a63add92 100644 --- a/modules/local/ampliconsuite/ampliconsuite.nf +++ b/modules/local/ampliconsuite/ampliconsuite.nf @@ -3,10 +3,12 @@ process AMPLICONSUITE { label 'process_low' conda "${moduleDir}/environment.yml" - container 'nf-core/prepareaa:1.0.1' + container 'nf-core/prepareaa:1.0.2' input: tuple val(meta), path(bam) + path(mosek_license_dir) + path(aa_data_repo) output: path "*.bed" , emit: bed @@ -20,14 +22,7 @@ process AMPLICONSUITE { path "*logs.txt" , emit: logs, optional: true path "*cycles.txt" , emit: cycles, optional: true path "*graph.txt" , emit: graph, optional: true - path "*summary.txt" , emit: summary, optional: true - path "*summary_map.txt" , emit: summary_map, optional: true - path "*edges.txt" , emit: edges, optional: true - path "*edges_cnseg.txt" , emit: edges_cnseg, optional: true - path "*.out" , emit: aa_out, optional: true - path "*.png" , emit: png, optional: true - path "*.pdf" , emit: pdf, optional: true - path "*finish_flag.txt" , emit: finish_flag, optional: true + path "*" path "versions.yml" , emit: versions when: @@ -39,20 +34,11 @@ process AMPLICONSUITE { def cngain = params.aa_cngain def ref = params.reference_build """ - export AA_DATA_REPO=${params.aa_data_repo} - export 
MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} + export AA_DATA_REPO=\$(echo $aa_data_repo) + export MOSEKLM_LICENSE_FILE=\$(echo $mosek_license_dir) # Define Variables AA_SRC and AC_SRC - if ! command -v AmpliconArchitect.py &> /dev/null; then - export AA_SRC=\$(dirname \$(python -c "import ampliconarchitectlib; print(ampliconarchitectlib.__file__)")) - else - export AA_SRC=\$(dirname \$(readlink -f \$(which AmpliconArchitect.py))) - fi - - if ! command -v amplicon_classifier.py &> /dev/null; then - export AC_SRC=\$(dirname \$(python -c "import ampliconclassifierlib; print(ampliconclassifierlib.__file__)")) - else - export AC_SRC=\$(dirname \$(readlink -f \$(which amplicon_classifier.py))) - fi + export AA_SRC=\$(dirname \$(python -c "import ampliconarchitectlib; print(ampliconarchitectlib.__file__)")) + export AC_SRC=\$(dirname \$(which amplicon_classifier.py)) REF=${params.reference_build} AmpliconSuite-pipeline.py \\ @@ -81,8 +67,11 @@ process AMPLICONSUITE { def cngain = params.aa_cngain def ref = params.reference_build """ - export AA_DATA_REPO=${params.aa_data_repo} - export MOSEKLM_LICENSE_FILE=${params.mosek_license_dir} + export AA_DATA_REPO=\$(echo $aa_data_repo) + export MOSEKLM_LICENSE_FILE=\$(echo $mosek_license_dir) + # Define Variables AA_SRC and AC_SRC + export AA_SRC=\$(dirname \$(python -c "import ampliconarchitectlib; print(ampliconarchitectlib.__file__)")) + export AC_SRC=\$(dirname \$(which amplicon_classifier.py)) REF=${params.reference_build} touch "${prefix}_CNV_SEEDS.bed" @@ -92,7 +81,7 @@ process AMPLICONSUITE { touch "${prefix}.timing_log.txt" touch "${prefix}_summary.txt" - PrepareAA.py --help + AmpliconSuite-pipeline.py --help cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/workflows/circdna.nf b/workflows/circdna.nf index a0398a56..2b26459f 100644 --- a/workflows/circdna.nf +++ b/workflows/circdna.nf @@ -392,7 +392,9 @@ workflow CIRCDNA { if (run_ampliconarchitect) { AMPLICONSUITE ( - ch_bam_sorted + ch_bam_sorted, + file(params.mosek_license_dir), + file(params.aa_data_repo) ) ch_versions = ch_versions.mix(AMPLICONSUITE.out.versions) } From cc888aa5fbc167a27f101847191ee05dd1e1eff4 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Tue, 9 Jan 2024 13:55:00 +0000 Subject: [PATCH 41/48] Updated dockerfile for ampliconsutie --- .github/workflows/build-docker-image.yml | 2 +- modules/local/ampliconsuite/Dockerfile | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml index f1325730..d4a194a6 100644 --- a/.github/workflows/build-docker-image.yml +++ b/.github/workflows/build-docker-image.yml @@ -24,4 +24,4 @@ jobs: with: file: modules/local/ampliconsuite/Dockerfile push: true - tags: "quay.io/nf-core/prepareaa:1.0.2" + tags: "quay.io/nf-core/prepareaa:1.0.3" diff --git a/modules/local/ampliconsuite/Dockerfile b/modules/local/ampliconsuite/Dockerfile index 15933ce0..ccbf65c7 100644 --- a/modules/local/ampliconsuite/Dockerfile +++ b/modules/local/ampliconsuite/Dockerfile @@ -22,6 +22,7 @@ RUN micromamba install --yes --name base -c bioconda -c conda-forge -c mosek \ # Append micromamba activation command to .bashrc RUN echo "micromamba activate base" >> ~/.bashrc +RUN echo "export PATH=/opt/conda/bin:\$PATH" >> ~/.bashrc # Switch back to the default user USER $NB_UID From 7393761c1b6a81f43e726ed1f75e5fc4e5b58da1 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Tue, 9 Jan 2024 14:04:56 +0000 Subject: [PATCH 42/48] singularity update dockerbuild --- 
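Notes: when Nextflow runs a task with the Singularity/Apptainer engine it typically calls
singularity exec on the converted image, which does not execute the Docker ENTRYPOINT and
does not source ~/.bashrc, so the micromamba activation never happens and the tools have
to be reachable on the default PATH (e.g. /opt/conda/bin). A rough manual check, with the
image tag assumed to be whichever was pushed last, to see whether
AmpliconSuite-pipeline.py resolves without the entrypoint:

    docker run --rm quay.io/nf-core/prepareaa:1.0.4 which AmpliconSuite-pipeline.py
    singularity exec docker://quay.io/nf-core/prepareaa:1.0.4 which AmpliconSuite-pipeline.py
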
.github/workflows/build-docker-image.yml | 2 +- modules/local/ampliconsuite/Dockerfile | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml index d4a194a6..58e45549 100644 --- a/.github/workflows/build-docker-image.yml +++ b/.github/workflows/build-docker-image.yml @@ -24,4 +24,4 @@ jobs: with: file: modules/local/ampliconsuite/Dockerfile push: true - tags: "quay.io/nf-core/prepareaa:1.0.3" + tags: "quay.io/nf-core/prepareaa:1.0.4" diff --git a/modules/local/ampliconsuite/Dockerfile b/modules/local/ampliconsuite/Dockerfile index ccbf65c7..3136393d 100644 --- a/modules/local/ampliconsuite/Dockerfile +++ b/modules/local/ampliconsuite/Dockerfile @@ -29,3 +29,13 @@ USER $NB_UID # Start a login bash shell by default CMD ["/bin/bash", "-l"] + +# Create an entrypoint script +RUN echo '#!/bin/bash' > /entrypoint.sh && \ + echo 'source ~/micromamba/etc/profile.d/mamba.sh' >> /entrypoint.sh && \ + echo 'micromamba activate base' >> /entrypoint.sh && \ + echo 'exec "$@"' >> /entrypoint.sh && \ + chmod +x /entrypoint.sh + +ENTRYPOINT ["/entrypoint.sh"] +CMD ["/bin/bash", "-l"] From 6375272ff39d51847053e5ba9d69679160e5815c Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Tue, 9 Jan 2024 14:30:02 +0000 Subject: [PATCH 43/48] update to fit singularity --- .github/workflows/build-docker-image.yml | 2 +- modules/local/ampliconsuite/Dockerfile | 16 ++++++++-------- modules/local/ampliconsuite/ampliconsuite.nf | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml index 58e45549..7d24ad2c 100644 --- a/.github/workflows/build-docker-image.yml +++ b/.github/workflows/build-docker-image.yml @@ -24,4 +24,4 @@ jobs: with: file: modules/local/ampliconsuite/Dockerfile push: true - tags: "quay.io/nf-core/prepareaa:1.0.4" + tags: "quay.io/nf-core/prepareaa:1.0.5" diff --git a/modules/local/ampliconsuite/Dockerfile b/modules/local/ampliconsuite/Dockerfile index 3136393d..116dfe41 100644 --- a/modules/local/ampliconsuite/Dockerfile +++ b/modules/local/ampliconsuite/Dockerfile @@ -13,26 +13,26 @@ RUN apt-get update && \ apt-get install -y procps && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +# Switch back to the default user +USER $NB_UID -# Install Conda packages with micromamba + +# Install Conda packages with micromamba, including Python RUN micromamba install --yes --name base -c bioconda -c conda-forge -c mosek \ bioconda::ampliconsuite=1.2.1 \ mosek::mosek=10.1.21 && \ micromamba clean --all --yes +# Assuming AmpliconSuite-pipeline.py is accessible in /opt/conda/bin +ENV PATH="/opt/conda//bin:${PATH}" + # Append micromamba activation command to .bashrc RUN echo "micromamba activate base" >> ~/.bashrc -RUN echo "export PATH=/opt/conda/bin:\$PATH" >> ~/.bashrc -# Switch back to the default user -USER $NB_UID - -# Start a login bash shell by default -CMD ["/bin/bash", "-l"] # Create an entrypoint script RUN echo '#!/bin/bash' > /entrypoint.sh && \ - echo 'source ~/micromamba/etc/profile.d/mamba.sh' >> /entrypoint.sh && \ + echo 'eval "$(micromamba shell hook --shell bash)"' >> /entrypoint.sh && \ echo 'micromamba activate base' >> /entrypoint.sh && \ echo 'exec "$@"' >> /entrypoint.sh && \ chmod +x /entrypoint.sh diff --git a/modules/local/ampliconsuite/ampliconsuite.nf b/modules/local/ampliconsuite/ampliconsuite.nf index a63add92..e79f0716 100644 --- a/modules/local/ampliconsuite/ampliconsuite.nf +++ 
b/modules/local/ampliconsuite/ampliconsuite.nf @@ -3,7 +3,7 @@ process AMPLICONSUITE { label 'process_low' conda "${moduleDir}/environment.yml" - container 'nf-core/prepareaa:1.0.2' + container 'nf-core/prepareaa:1.0.4' input: tuple val(meta), path(bam) From f2fce801b9f4e4c7c04577eadcb18cba28896f4a Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Tue, 9 Jan 2024 19:33:30 +0000 Subject: [PATCH 44/48] Updated modules for ampliconclassifier --- conf/modules.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 73ce2fcf..3c8cf010 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -329,13 +329,13 @@ process { [ path: { "${params.outdir}/ampliconsuite/logs" }, mode: params.publish_dir_mode, - pattern: '*{log.txt,summary.txt,sample_metadata.json,run_metadata.json,finish_flag.txt}', + pattern: '*{log.txt,sample_metadata.json,run_metadata.json,finish_flag.txt}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], [ path: { "${params.outdir}/ampliconsuite/ampliconarchitect/" }, mode: params.publish_dir_mode, - pattern: '*{graph.txt,cycles.txt}', + pattern: '*{summary.txt,graph.txt,cycles.txt}', saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ], [ From 18246fee7f7cd09ba8ed8da3907250f65f7b3dd7 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Tue, 9 Jan 2024 19:33:40 +0000 Subject: [PATCH 45/48] updated modules --- modules/local/ampliconsuite/ampliconsuite.nf | 2 +- modules/nf-core/multiqc/environment.yml | 7 ++ modules/nf-core/multiqc/main.nf | 55 ++++++++++++ modules/nf-core/multiqc/meta.yml | 58 +++++++++++++ modules/nf-core/multiqc/tests/main.nf.test | 83 +++++++++++++++++++ .../nf-core/multiqc/tests/main.nf.test.snap | 21 +++++ modules/nf-core/multiqc/tests/tags.yml | 2 + 7 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 modules/nf-core/multiqc/environment.yml create mode 100644 modules/nf-core/multiqc/main.nf create mode 100644 modules/nf-core/multiqc/meta.yml create mode 100644 modules/nf-core/multiqc/tests/main.nf.test create mode 100644 modules/nf-core/multiqc/tests/main.nf.test.snap create mode 100644 modules/nf-core/multiqc/tests/tags.yml diff --git a/modules/local/ampliconsuite/ampliconsuite.nf b/modules/local/ampliconsuite/ampliconsuite.nf index e79f0716..1a2a56c1 100644 --- a/modules/local/ampliconsuite/ampliconsuite.nf +++ b/modules/local/ampliconsuite/ampliconsuite.nf @@ -3,7 +3,7 @@ process AMPLICONSUITE { label 'process_low' conda "${moduleDir}/environment.yml" - container 'nf-core/prepareaa:1.0.4' + container 'nf-core/prepareaa:1.0.5' input: tuple val(meta), path(bam) diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml new file mode 100644 index 00000000..bc0bdb5b --- /dev/null +++ b/modules/nf-core/multiqc/environment.yml @@ -0,0 +1,7 @@ +name: multiqc +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multiqc=1.18 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf new file mode 100644 index 00000000..70708f33 --- /dev/null +++ b/modules/nf-core/multiqc/main.nf @@ -0,0 +1,55 @@ +process MULTIQC { + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/multiqc:1.18--pyhdfd78af_0' : + 'biocontainers/multiqc:1.18--pyhdfd78af_0' }" + + input: + path multiqc_files, stageAs: "?/*" + path(multiqc_config) + path(extra_multiqc_config) + path(multiqc_logo) + + output: + path "*multiqc_report.html", emit: report + path "*_data" , emit: data + path "*_plots" , optional:true, emit: plots + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def config = multiqc_config ? "--config $multiqc_config" : '' + def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' + def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : '' + """ + multiqc \\ + --force \\ + $args \\ + $config \\ + $extra_config \\ + $logo \\ + . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) + END_VERSIONS + """ + + stub: + """ + mkdir multiqc_data + touch multiqc_plots + touch multiqc_report.html + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml new file mode 100644 index 00000000..45a9bc35 --- /dev/null +++ b/modules/nf-core/multiqc/meta.yml @@ -0,0 +1,58 @@ +name: multiqc +description: Aggregate results from bioinformatics analyses across many samples into a single report +keywords: + - QC + - bioinformatics tools + - Beautiful stand-alone HTML report +tools: + - multiqc: + description: | + MultiQC searches a given directory for analysis logs and compiles a HTML report. + It's a general use tool, perfect for summarising the output from numerous bioinformatics tools. + homepage: https://multiqc.info/ + documentation: https://multiqc.info/docs/ + licence: ["GPL-3.0-or-later"] +input: + - multiqc_files: + type: file + description: | + List of reports / files recognised by MultiQC, for example the html and zip output of FastQC + - multiqc_config: + type: file + description: Optional config yml for MultiQC + pattern: "*.{yml,yaml}" + - extra_multiqc_config: + type: file + description: Second optional config yml for MultiQC. Will override common sections in multiqc_config. 
+ pattern: "*.{yml,yaml}" + - multiqc_logo: + type: file + description: Optional logo file for MultiQC + pattern: "*.{png}" +output: + - report: + type: file + description: MultiQC report file + pattern: "multiqc_report.html" + - data: + type: directory + description: MultiQC data dir + pattern: "multiqc_data" + - plots: + type: file + description: Plots created by MultiQC + pattern: "*_data" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@abhi18av" + - "@bunop" + - "@drpatelh" + - "@jfy133" +maintainers: + - "@abhi18av" + - "@bunop" + - "@drpatelh" + - "@jfy133" diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test new file mode 100644 index 00000000..d0438eda --- /dev/null +++ b/modules/nf-core/multiqc/tests/main.nf.test @@ -0,0 +1,83 @@ +nextflow_process { + + name "Test Process MULTIQC" + script "../main.nf" + process "MULTIQC" + tag "modules" + tag "modules_nfcore" + tag "multiqc" + + test("sarscov2 single-end [fastqc]") { + + when { + process { + """ + input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) + input[1] = [] + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, + { assert process.out.data[0] ==~ ".*/multiqc_data" }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + test("sarscov2 single-end [fastqc] [config]") { + + when { + process { + """ + input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) + input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, + { assert process.out.data[0] ==~ ".*/multiqc_data" }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("sarscov2 single-end [fastqc] - stub") { + + options "-stub" + + when { + process { + """ + input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) + input[1] = [] + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.report.collect { file(it).getName() } + + process.out.data.collect { file(it).getName() } + + process.out.plots.collect { file(it).getName() } + + process.out.versions ).match() } + ) + } + + } +} diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap new file mode 100644 index 00000000..d087a9df --- /dev/null +++ b/modules/nf-core/multiqc/tests/main.nf.test.snap @@ -0,0 +1,21 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,f81e19ab3a8e2b6f2b5d22078117df71" + ] + ], + "timestamp": "2023-12-30T00:26:14.048089591" + }, + "sarscov2 single-end [fastqc] - stub": { + "content": [ + [ + "multiqc_report.html", + "multiqc_data", + "multiqc_plots", + "versions.yml:md5,f81e19ab3a8e2b6f2b5d22078117df71" + ] + ], + "timestamp": "2023-12-30T00:26:52.963964055" + } +} \ No newline at end of file diff --git a/modules/nf-core/multiqc/tests/tags.yml b/modules/nf-core/multiqc/tests/tags.yml new file mode 100644 index 00000000..bea6c0d3 --- 
/dev/null +++ b/modules/nf-core/multiqc/tests/tags.yml @@ -0,0 +1,2 @@ +multiqc: + - modules/nf-core/multiqc/** From cea7a6e12011167d6708c38faabdc9fb0e9dfab0 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Tue, 9 Jan 2024 19:34:21 +0000 Subject: [PATCH 46/48] removed nf-core/multiqc --- modules/nf-core/multiqc/environment.yml | 7 -- modules/nf-core/multiqc/main.nf | 55 ------------ modules/nf-core/multiqc/meta.yml | 58 ------------- modules/nf-core/multiqc/tests/main.nf.test | 83 ------------------- .../nf-core/multiqc/tests/main.nf.test.snap | 21 ----- modules/nf-core/multiqc/tests/tags.yml | 2 - 6 files changed, 226 deletions(-) delete mode 100644 modules/nf-core/multiqc/environment.yml delete mode 100644 modules/nf-core/multiqc/main.nf delete mode 100644 modules/nf-core/multiqc/meta.yml delete mode 100644 modules/nf-core/multiqc/tests/main.nf.test delete mode 100644 modules/nf-core/multiqc/tests/main.nf.test.snap delete mode 100644 modules/nf-core/multiqc/tests/tags.yml diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml deleted file mode 100644 index bc0bdb5b..00000000 --- a/modules/nf-core/multiqc/environment.yml +++ /dev/null @@ -1,7 +0,0 @@ -name: multiqc -channels: - - conda-forge - - bioconda - - defaults -dependencies: - - bioconda::multiqc=1.18 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf deleted file mode 100644 index 70708f33..00000000 --- a/modules/nf-core/multiqc/main.nf +++ /dev/null @@ -1,55 +0,0 @@ -process MULTIQC { - label 'process_single' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.18--pyhdfd78af_0' : - 'biocontainers/multiqc:1.18--pyhdfd78af_0' }" - - input: - path multiqc_files, stageAs: "?/*" - path(multiqc_config) - path(extra_multiqc_config) - path(multiqc_logo) - - output: - path "*multiqc_report.html", emit: report - path "*_data" , emit: data - path "*_plots" , optional:true, emit: plots - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def config = multiqc_config ? "--config $multiqc_config" : '' - def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' - def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : '' - """ - multiqc \\ - --force \\ - $args \\ - $config \\ - $extra_config \\ - $logo \\ - . - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ - - stub: - """ - mkdir multiqc_data - touch multiqc_plots - touch multiqc_report.html - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml deleted file mode 100644 index 45a9bc35..00000000 --- a/modules/nf-core/multiqc/meta.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: multiqc -description: Aggregate results from bioinformatics analyses across many samples into a single report -keywords: - - QC - - bioinformatics tools - - Beautiful stand-alone HTML report -tools: - - multiqc: - description: | - MultiQC searches a given directory for analysis logs and compiles a HTML report. - It's a general use tool, perfect for summarising the output from numerous bioinformatics tools. 
- homepage: https://multiqc.info/ - documentation: https://multiqc.info/docs/ - licence: ["GPL-3.0-or-later"] -input: - - multiqc_files: - type: file - description: | - List of reports / files recognised by MultiQC, for example the html and zip output of FastQC - - multiqc_config: - type: file - description: Optional config yml for MultiQC - pattern: "*.{yml,yaml}" - - extra_multiqc_config: - type: file - description: Second optional config yml for MultiQC. Will override common sections in multiqc_config. - pattern: "*.{yml,yaml}" - - multiqc_logo: - type: file - description: Optional logo file for MultiQC - pattern: "*.{png}" -output: - - report: - type: file - description: MultiQC report file - pattern: "multiqc_report.html" - - data: - type: directory - description: MultiQC data dir - pattern: "multiqc_data" - - plots: - type: file - description: Plots created by MultiQC - pattern: "*_data" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@abhi18av" - - "@bunop" - - "@drpatelh" - - "@jfy133" -maintainers: - - "@abhi18av" - - "@bunop" - - "@drpatelh" - - "@jfy133" diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test deleted file mode 100644 index d0438eda..00000000 --- a/modules/nf-core/multiqc/tests/main.nf.test +++ /dev/null @@ -1,83 +0,0 @@ -nextflow_process { - - name "Test Process MULTIQC" - script "../main.nf" - process "MULTIQC" - tag "modules" - tag "modules_nfcore" - tag "multiqc" - - test("sarscov2 single-end [fastqc]") { - - when { - process { - """ - input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) - input[1] = [] - input[2] = [] - input[3] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, - { assert process.out.data[0] ==~ ".*/multiqc_data" }, - { assert snapshot(process.out.versions).match("versions") } - ) - } - - } - - test("sarscov2 single-end [fastqc] [config]") { - - when { - process { - """ - input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) - input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) - input[2] = [] - input[3] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, - { assert process.out.data[0] ==~ ".*/multiqc_data" }, - { assert snapshot(process.out.versions).match("versions") } - ) - } - } - - test("sarscov2 single-end [fastqc] - stub") { - - options "-stub" - - when { - process { - """ - input[0] = Channel.of([file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz_fastqc_zip'], checkIfExists: true)]) - input[1] = [] - input[2] = [] - input[3] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out.report.collect { file(it).getName() } + - process.out.data.collect { file(it).getName() } + - process.out.plots.collect { file(it).getName() } + - process.out.versions ).match() } - ) - } - - } -} diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap deleted file mode 100644 index d087a9df..00000000 --- a/modules/nf-core/multiqc/tests/main.nf.test.snap +++ /dev/null @@ -1,21 +0,0 @@ -{ - "versions": { - "content": [ - [ - 
"versions.yml:md5,f81e19ab3a8e2b6f2b5d22078117df71" - ] - ], - "timestamp": "2023-12-30T00:26:14.048089591" - }, - "sarscov2 single-end [fastqc] - stub": { - "content": [ - [ - "multiqc_report.html", - "multiqc_data", - "multiqc_plots", - "versions.yml:md5,f81e19ab3a8e2b6f2b5d22078117df71" - ] - ], - "timestamp": "2023-12-30T00:26:52.963964055" - } -} \ No newline at end of file diff --git a/modules/nf-core/multiqc/tests/tags.yml b/modules/nf-core/multiqc/tests/tags.yml deleted file mode 100644 index bea6c0d3..00000000 --- a/modules/nf-core/multiqc/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -multiqc: - - modules/nf-core/multiqc/** From f9a0f03647f2c867e84f423eb49b1944ead14df2 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Tue, 9 Jan 2024 19:34:32 +0000 Subject: [PATCH 47/48] updated ampliconsuite documentation --- docs/output.md | 72 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 23 deletions(-) diff --git a/docs/output.md b/docs/output.md index 5947472d..4ed9d0d9 100644 --- a/docs/output.md +++ b/docs/output.md @@ -276,7 +276,27 @@ This Branch utilises the ability of [Unicycler](https://github.com/rrwick/Unicyc ### Branch: `ampliconarchitect` -This pipeline branch `ampliconarchitect` is only usable with WGS data. This branch uses the utility of [PrepareAA](https://github.com/jluebeck/Prepare) to collect amplified seeds from copy number calls, which will be then fed to [AmpliconArchitect](https://github.com/jluebeck/AmpliconArchitect) to characterise amplicons in each given sample. +This pipeline branch `ampliconarchitect` is only usable with WGS data. This branch uses the utility of the [AmpliconSuite-Pipeline](https://github.com/AmpliconSuite/AmpliconSuite-pipeline) to call copy numbers using [CNVkit](https://cnvkit.readthedocs.io/en/stable/), collect amplified seeds from copy number calls, calls amplicons by using [AmpliconArchitect](https://github.com/jluebeck/AmpliconArchitect), and classifies these amplicons using[AmpliconClassifier](https://github.com/jluebeck/AmpliconClassifier). + +#### **AmpliconSuite-Pipeline** + +[AmpliconSuite-Pipeline](https://github.com/AmpliconSuite/AmpliconSuite-pipeline) is performing all necessary steps to call copy numbers and amplicons from WGS data. + +
+Output files
+
+**Output directory: `results/ampliconsuite/logs`**
+
+- `[SAMPLE]_run_metadata.json`
+  - `json` file describing the run metadata, including the software versions, parameters, and commands used
+- `[SAMPLE]_perc_timing_log.txt`
+  - `txt` file describing the computing time of each process in the pipeline
+- `[SAMPLE]_sample_metadata.json`
+  - `json` file describing the sample metadata
+- `[SAMPLE]_finishing_flag.txt`
+  - `txt` file indicating whether the pipeline ran to completion
+
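The run metadata JSON is the quickest place to confirm which tool versions and parameters a run actually used. A minimal sketch for checking a finished run, assuming `jq` is available; the exact JSON fields and the flag filename (`*_finish_flag.txt` vs. `*_finishing_flag.txt`) can vary between AmpliconSuite releases, so treat them as placeholders:

```bash
#!/usr/bin/env bash
# Sketch only: verify an AmpliconSuite run completed and show what it recorded.
# The directory layout and sample name are placeholders; adjust to your --outdir.
set -euo pipefail

logdir="results/ampliconsuite/logs"
sample="SAMPLE1"   # hypothetical sample ID

# The flag file is only written once the pipeline ran to completion.
if ls "${logdir}/${sample}"*finish*flag*.txt >/dev/null 2>&1; then
    echo "${sample}: finished"
else
    echo "${sample}: no finish flag found" >&2
fi

# Pretty-print the recorded run metadata (versions, parameters, commands).
jq '.' "${logdir}/${sample}_run_metadata.json"
```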
    #### **CNVkit** @@ -285,12 +305,14 @@ This pipeline branch `ampliconarchitect` is only usable with WGS data. This bran
Output files

-**Output directory: `results/cnvkit`**
+**Output directory: `results/ampliconsuite/cnvkit`**

 - `[SAMPLE]_CNV_GAIN.bed`
   - `bed` file containing filtered Copy Number calls
 - `[SAMPLE]_AA_CNV_SEEDS.bed`
   - `bed` file containing filtered and connected amplified regions (seeds). This is used as input for [AmpliconArchitect](https://github.com/jluebeck/AmpliconArchitect)
+- `[SAMPLE].md_CNV_CALLS_.bed`
+  - `bed` file containing the copy number calls in BED format.
 - `[SAMPLE].cnvkit.segment.cns`
   - `cns` file containing copy number calls of CNVkit segment.
 - `[SAMPLE].cnvkit.segment.cnr`
@@ -300,20 +322,20 @@

 #### **AmpliconArchitect**

-[AmpliconArchitect](https://github.com/jluebeck/AmpliconArchitect) uses amplicon seeds provided by `CNVkit`and `PrepareAA`to identify different types of amplicons in each sample.
+[AmpliconArchitect](https://github.com/jluebeck/AmpliconArchitect) uses amplicon seeds provided by `CNVkit` and `PrepareAA` inside the [AmpliconSuite-Pipeline](https://github.com/AmpliconSuite/AmpliconSuite-pipeline) to identify different types of amplicons in each sample.
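When AmpliconArchitect or AmpliconClassifier is run outside the pipeline, it relies on a small set of environment variables; the sketch below mirrors the exports made in the pipeline's own AMPLICONSUITE module before it calls `AmpliconSuite-pipeline.py`. The two paths are placeholders for a local AA data repo and Mosek license directory:

```bash
# Sketch of the environment expected by AmpliconArchitect/AmpliconClassifier,
# mirroring the exports in the AMPLICONSUITE module. Paths are placeholders.
export AA_DATA_REPO=/path/to/aa_data_repo           # AA reference data repo (e.g. GRCh38)
export MOSEKLM_LICENSE_FILE=/path/to/mosek_license  # directory containing mosek.lic
export AA_SRC=$(dirname $(python -c "import ampliconarchitectlib; print(ampliconarchitectlib.__file__)"))
export AC_SRC=$(dirname $(which amplicon_classifier.py))
```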
    Output files -**Output directory: `results/ampliconarchitect/ampliconarchitect`** +**Output directory: `results/ampliconsuite/ampliconarchitect`** - `amplicons/[SAMPLE]_[AMPLICONID]_cycles.txt` - `txt`file describing the amplicon segments - `amplicons/[SAMPLE]_[AMPLICONID]_graph.txt` - `txt` file describing the amplicon graph -- `cnseg/[SAMPLE]_[SEGMENT]_graph.txt` +- `intermediate/[SAMPLE]_[SEGMENT]_graph.txt` - `txt` file describing the copy number segmentation file -- `summary/[SAMPLE]_summary.txt` +- `[SAMPLE]_summary.txt` - `txt` file describing each amplicon with regards to breakpoints, composition, oncogene content, copy number - `sv_view/[SAMPLE]_[AMPLICONID].{png,pdf}` - `png` or `pdf` file displaying the amplicon rearrangement signature @@ -327,24 +349,28 @@ This pipeline branch `ampliconarchitect` is only usable with WGS data. This bran
Output files

-**Output directory: `results/ampliconclassifier`**
-
-- `makeinput/ampliconclassifier.input`
-  - `txt` file containing the input used for `AmpliconClassifier` and `AmpliconSimilarity`.
-- `ampliconclassifier/ampliconclassifier_amplicon_classification_profiles.tsv`
-  - `tsv` file describing the amplicon class of each amplicon for each sample.
-- `ecDNA_counts/ampliconclassifier_ecDNA_counts.tsv`
-  - `tsv` file describing if an amplicon is circular [1 = circular, 0 = non-circular].
-- `gene_list/ampliconclassifier_gene_list.tsv`
-  - `tsv` file detailing the genes on each amplicon.
-- `log/ampliconclassifier_stdout.log`
-  - `log` file
-- `ampliconsimilarity/ampliconclassifier_similarity_scores.tsv`
-  - `tsv` file containing amplicon similarity scores calculated by `AmpliconSimilarity`.
-- `bed/[SAMPLE]_amplicon[AMPLICONID]_[CLASSIFICATION]_[ID]_intervals.bed`
-  - `bed` files containing information about the intervals on each amplicon. `unknown` intervals were not identified to be located on the respective amplicon.
-- `resultstable/ampliconclassifier_result_table.[tsv,json]`
+**Output directory: `results/ampliconsuite/ampliconclassifier`**
+
+- `input/[SAMPLE].input`
+  - `txt` file containing the input used for `AmpliconClassifier`.
+- `amplicon_information/[SAMPLE]_[AMPLICON]_intervals.bed`
+  - `bed` file containing the regions of the respective amplicon.
+- `amplicon_information/[SAMPLE]_[AMPLICON]_SV_summary.tsv`
+  - `tsv` file detailing the SVs identified in an amplicon.
+- `amplicon_information/[SAMPLE]_[AMPLICON]_annotated_cycles.txt`
+  - `txt` file containing the annotated cycles information of AmpliconArchitect.
+- `result/[SAMPLE]_result_table.[tsv,json]`
   - `tsv` or `json` file of the results table generated by `AmpliconClassifier`, which combines the output of `AmpliconArchitect` and `AmpliconClassifier`.
+- `[SAMPLE]_amplicon_classification_profiles.tsv`
+  - `tsv` file describing the amplicon classes.
+- `[SAMPLE]_gene_list.tsv`
+  - `tsv` file detailing the genes within each amplicon.
+- `[SAMPLE]_context_calls.tsv`
+  - `tsv` file with context for ecDNA calls.
+- `[SAMPLE]_ecDNA_counts.tsv`
+  - `tsv` file with the number of ecDNAs in this sample.
+- `[SAMPLE]_feature_basic_properties.tsv`
+  - `tsv` file with basic amplicon properties: captured region size, median feature CN, max feature CN, and borderline flag.
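For cohort-level summaries it is often convenient to merge the per-sample classification tables into a single overview. A minimal sketch that concatenates the `*_amplicon_classification_profiles.tsv` files, assuming each file carries its own header row (column names differ between AmpliconClassifier versions, so check the header before relying on specific columns):

```bash
#!/usr/bin/env bash
# Sketch only: merge per-sample AmpliconClassifier profiles into one table,
# keeping a single header row. The directory layout is a placeholder.
set -euo pipefail

indir="results/ampliconsuite/ampliconclassifier"
out="all_samples_amplicon_classification_profiles.tsv"

first=1
for f in "${indir}"/*_amplicon_classification_profiles.tsv; do
    if [[ ${first} -eq 1 ]]; then
        cat "$f" > "${out}"          # keep the header from the first file
        first=0
    else
        tail -n +2 "$f" >> "${out}"  # skip the header of subsequent files
    fi
done

echo "Wrote $(wc -l < "${out}") lines to ${out}"
```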
    From af32e8c37bc7928bbfa9a99b321632e6df18c516 Mon Sep 17 00:00:00 2001 From: DSchreyer Date: Tue, 9 Jan 2024 19:34:39 +0000 Subject: [PATCH 48/48] updated changelog --- CHANGELOG.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e09f0352..23d10789 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,11 +3,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v1.0.5 - [2023-06-26] +## v1.0.5 - [2024-01-09] ### `Added` -- AmpliconSuite process +- AmpliconSuite-Pipeline process +- AmpliconSuite output description ### `Fixed`