Merge pull request #104 from sanger-tol/dev

Release 0.5
sanger-tol · Jul 31, 2024 · 1c0bf53 · 1c0bf53
2 parents 3ba2b04 + 544c135
commit 1c0bf53
Show file tree

Hide file tree

Showing 35 changed files with 510 additions and 146 deletions.
diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml
@@ -1,15 +1,15 @@
 name: nf-core branch protection
-# This workflow is triggered on PRs to master branch on the repository
-# It fails when someone tries to make a PR against the nf-core `master` branch instead of `dev`
+# This workflow is triggered on PRs to main branch on the repository
+# It fails when someone tries to make a PR against the nf-core `main` branch instead of `dev`
 on:
   pull_request_target:
-    branches: [master]
+    branches: [main]
 
 jobs:
   test:
     runs-on: ubuntu-latest
     steps:
-      # PRs to the nf-core repo master branch are only ok if coming from the nf-core repo `dev` or any `patch` branches
+      # PRs to the nf-core repo main branch are only ok if coming from the nf-core repo `dev` or any `patch` branches
       - name: Check PRs
         if: github.repository == 'sanger-tol/blobtoolkit'
         run: |
@@ -22,7 +22,7 @@ jobs:
         uses: mshick/add-pr-comment@v1
         with:
           message: |
-            ## This PR is against the `master` branch :x:
+            ## This PR is against the `main` branch :x:
 
             * Do not close this PR
             * Click _Edit_ and change the `base` to `dev`
@@ -32,9 +32,9 @@ jobs:
 
             Hi @${{ github.event.pull_request.user.login }},
 
-            It looks like this pull-request is has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `master` branch.
-            The `master` branch on nf-core repositories should always contain code from the latest release.
-            Because of this, PRs to `master` are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch.
+            It looks like this pull-request has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `main` branch.
+            The `main` branch on nf-core repositories should always contain code from the latest release.
+            Because of this, PRs to `main` are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch.
 
             You do not need to close this PR, you can change the target branch to `dev` by clicking the _"Edit"_ button at the top of this page.
             Note that even after this, the test will continue to show as failing until you push a new commit.

diff --git a/.github/workflows/sanger_test.yml b/.github/workflows/sanger_test.yml
@@ -17,7 +17,7 @@ jobs:
         with:
           workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
           access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
-          compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
+          compute_env: ${{ secrets.TOWER_COMPUTE_ENV_LARGE }}
           revision: ${{ env.REVISION }}
           workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }}
           parameters: |

diff --git a/.github/workflows/sanger_test_full.yml b/.github/workflows/sanger_test_full.yml
@@ -1,6 +1,10 @@
 name: sanger-tol LSF full size tests
 
 on:
+  push:
+    branches:
+      - main
+      - dev
   workflow_dispatch:
 jobs:
   run-tower:
@@ -22,7 +26,7 @@ jobs:
         with:
           workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
           access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
-          compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
+          compute_env: ${{ secrets.TOWER_COMPUTE_ENV_LARGE }}
           revision: ${{ env.REVISION }}
           workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }}
           parameters: |

diff --git a/.nf-core.yml b/.nf-core.yml
@@ -18,6 +18,7 @@ lint:
     - .github/ISSUE_TEMPLATE/bug_report.yml
     - .github/PULL_REQUEST_TEMPLATE.md
     - .github/workflows/linting.yml
+    - .github/workflows/branch.yml
   multiqc_config:
     - report_comment
   nextflow_config:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,30 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [[0.5.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.5.0)] – Snorlax – [2024-07-31]
+
+General tidy up of the configuration and the pipeline
+
+### Enhancements & fixes
+
+- Increased the resources for blastn
+- Removed some options that were not used or not needed
+- All relevant outputs are now copied to the output directory
+- Fixed some blast parameters to match the behaviour of the Snakemake pipeline
+- Fixed parsing of samplesheets from fetchngs to capture correct data type
+
+### Parameters
+
+| Old parameter   | New parameter |
+| --------------- | ------------- |
+| --taxa_file     |               |
+| --blastp_outext |               |
+| --blastp_cols   |               |
+| --blastx_outext |               |
+| --blastx_cols   |               |
+
+> **NB:** Parameter has been **updated** if both old and new parameter information is present. <br /> **NB:** Parameter has been **added** if just the new parameter information is present. <br /> **NB:** Parameter has been **removed** if new parameter information isn't present.
+
 ## [[0.4.0](https://github.com/sanger-tol/blobtoolkit/releases/tag/0.4.0)] – Buneary – [2024-04-17]
 
 The pipeline has now been validated on dozens of genomes, up to 11 Gbp.

diff --git a/README.md b/README.md
@@ -20,8 +20,8 @@ It takes a samplesheet of BAM/CRAM/FASTQ/FASTA files as input, calculates genome
 4. Run BUSCO ([`busco`](https://busco.ezlab.org/))
 5. Extract BUSCO genes ([`blobtoolkit/extractbuscos`](https://github.com/blobtoolkit/blobtoolkit))
 6. Run Diamond BLASTp against extracted BUSCO genes ([`diamond/blastp`](https://github.com/bbuchfink/diamond))
-7. Run BLASTn against extracted BUSCO genes ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/))
-8. Run BLASTx against extracted BUSCO genes ([`blast/blastx`](https://www.ncbi.nlm.nih.gov/books/NBK131777/))
+7. Run BLASTx against sequences with no hit ([`diamond/blastx`](https://github.com/bbuchfink/diamond))
+8. Run BLASTn against sequences still with no hit ([`blast/blastn`](https://www.ncbi.nlm.nih.gov/books/NBK131777/))
 9. Count BUSCO genes ([`blobtoolkit/countbuscos`](https://github.com/blobtoolkit/blobtoolkit))
 10. Generate combined sequence stats across various window sizes ([`blobtoolkit/windowstats`](https://github.com/blobtoolkit/blobtoolkit))
 11. Imports analysis results into a BlobDir dataset ([`blobtoolkit/blobdir`](https://github.com/blobtoolkit/blobtoolkit))

diff --git a/conf/base.config b/conf/base.config
@@ -104,6 +104,18 @@ process {
         time   = { check_max( 3.h * Math.ceil(meta.genome_size/1000000000) * task.attempt, 'time') }
     }
 
+    withName: "BLAST_BLASTN" {
+
+        // There are blast failures we don't know how to fix. Just ignore for now
+        errorStrategy = { task.exitStatus in ((130..145) + 104) ? (task.attempt == process.maxRetries ? 'ignore' : 'retry') : 'finish' }
+
+        // Most jobs complete quickly but some need a lot longer. For those outliers,
+        // CPU usage usually remains low, often close to a single CPU
+        cpus   = { check_max( 6    -            (task.attempt-1), 'cpus'   ) }
+        memory = { check_max( 1.GB * Math.pow(4, task.attempt-1), 'memory' ) }
+        time   = { check_max( 10.h * Math.pow(4, task.attempt-1), 'time'   ) }
+    }
+
     withName:CUSTOM_DUMPSOFTWAREVERSIONS {
         cache = false
     }

diff --git a/conf/modules.config b/conf/modules.config
@@ -48,6 +48,14 @@ process {
         ext.args = { "-ax map-ont -I" + Math.ceil(meta2.genome_size/1e9) + 'G' }
     }
 
+    withName: "MINIMAP2_.*" {
+        publishDir = [
+            path: { "${params.outdir}/read_mapping/${meta.datatype}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
+        ]
+    }
+
     withName: "SAMTOOLS_VIEW" {
         ext.args = "--output-fmt bam --write-index"
     }
@@ -60,6 +68,22 @@ process {
         ext.args = "--lineage --busco"
     }
 
+    withName: "PIGZ_COMPRESS" {
+        publishDir = [
+            path: { "${params.outdir}/base_content" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename.minus("fw_out/") }
+        ]
+    }
+
+    withName: "BLOBTK_DEPTH" {
+        publishDir = [
+            path: { "${params.outdir}/read_mapping/${meta.datatype}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : "${meta.id}.coverage.1k.bed.gz" }
+        ]
+    }
+
     withName: "BUSCO" {
         scratch = true
         ext.args = { 'test' in workflow.profile.tokenize(',') ?
@@ -114,7 +138,7 @@ process {
     }
 
     withName: "BLAST_BLASTN" {
-        ext.args = "-outfmt '6 qseqid staxids bitscore std' -max_target_seqs 10 -max_hsps 1 -evalue 1.0e-10 -lcase_masking -dust '20 64 1'"
+        ext.args = "-task megablast -outfmt '6 qseqid staxids bitscore std' -max_target_seqs 10 -max_hsps 1 -evalue 1.0e-10 -lcase_masking -dust '20 64 1'"
     }
 
     withName: "CUSTOM_DUMPSOFTWAREVERSIONS" {

diff --git a/docs/output.md b/docs/output.md
@@ -15,6 +15,9 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 - [BlobDir](#blobdir) - Output files viewable on a [BlobToolKit viewer](https://github.com/blobtoolkit/blobtoolkit)
 - [Static plots](#static-plots) - Static versions of the BlobToolKit plots
 - [BUSCO](#busco) - BUSCO results
+- [Read alignments](#read-alignments) - Aligned reads (optional)
+- [Read coverage](#read-coverage) - Read coverage tracks
+- [Base content](#base-content) - _k_-mer statistics (for k &le; 4)
 - [MultiQC](#multiqc) - Aggregate report describing results from the whole pipeline
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
 
@@ -26,8 +29,8 @@ The files in the BlobDir dataset which is used to create the online interactive
 <summary>Output files</summary>
 
 - `blobtoolkit/`
-  - `<accession>/`
-    - `*.json.gz`: files generated from genome and alignment coverage statistics
+  - `<assembly-name>/`
+    - `*.json.gz`: files generated from genome and alignment coverage statistics.
 
 More information about visualising the data in the [BlobToolKit repository](https://github.com/blobtoolkit/blobtoolkit/tree/main/src/viewer)
 
@@ -53,12 +56,56 @@ BUSCO results generated by the pipeline (all BUSCO lineages that match the claas
 <details markdown="1">
 <summary>Output files</summary>
 
-- `blobtoolkit/`
-  - `busco/`
-    - `*.batch_summary.txt`: BUSCO scores as tab-separated files (1 file per lineage).
-    - `*.fasta.txt`: BUSCO scores as formatted text (1 file per lineage).
-    - `*.json`: BUSCO scores as JSON (1 file per lineage).
-    - `*/`: all output BUSCO files, including the coordinate and sequence files of the annotated genes.
+- `busco/`
+  - `<lineage-name>/`
+    - `short_summary.json`: BUSCO scores for that lineage as JSON.
+    - `short_summary.tsv`: BUSCO scores for that lineage as a tab-separated file.
+    - `short_summary.txt`: BUSCO scores for that lineage as formatted text.
+    - `full_table.tsv`: Coordinates of the annotated BUSCO genes as a tab-separated file.
+    - `missing_busco_list.tsv`: List of the BUSCO genes that could not be found.
+    - `*_busco_sequences.tar.gz`: Sequences of the annotated BUSCO genes. 1 _tar_ archive for each of the three annotation levels (`single_copy`, `multi_copy`, `fragmented`), with 1 file per gene.
+    - `hmmer_output.tar.gz`: Archive of the HMMER alignment scores.
+
+</details>
+
+### Read alignments
+
+Read alignments in BAM format -- only if the pipeline is run with `--align`.
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `read_mapping/`
+  - `<datatype>/`
+    - `<sample>.bam`: alignments of that sample's reads in BAM format.
+
+</details>
+
+### Read coverage
+
+Read coverage statistics as computed by the pipeline.
+Those files are the raw data used to build the BlobDir.
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `read_mapping/`
+  - `<datatype>/`
+    - `<sample>.coverage.1k.bed.gz`: Bedgraph file with the coverage of the alignments of that sample per 1 kbp windows.
+
+</details>
+
+### Base content
+
+_k_-mer statistics.
+Those files are the raw data used to build the BlobDir.
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `base_content/`
+  - `<assembly-name>_*nuc_windows.tsv.gz`: Tab-separated files with the counts of every _k_-mer for k &le; 4 in 1 kbp windows. The first three columns correspond to the coordinates (sequence name, start, end), followed by each _k_-mer.
+  - `<assembly-name>_freq_windows.tsv.gz`: Tab-separated files with frequencies derived from the _k_-mer counts.
 
 </details>
 

diff --git a/modules.json b/modules.json
@@ -30,12 +30,14 @@
                     "diamond/blastp": {
                         "branch": "master",
                         "git_sha": "b29f6beb86d1d24d680277fb1a3f4de7b8b8a92c",
-                        "installed_by": ["modules"]
+                        "installed_by": ["modules"],
+                        "patch": "modules/nf-core/diamond/blastp/diamond-blastp.diff"
                     },
                     "diamond/blastx": {
                         "branch": "master",
                         "git_sha": "b29f6beb86d1d24d680277fb1a3f4de7b8b8a92c",
-                        "installed_by": ["modules"]
+                        "installed_by": ["modules"],
+                        "patch": "modules/nf-core/diamond/blastx/diamond-blastx.diff"
                     },
                     "fastawindows": {
                         "branch": "master",
@@ -64,6 +66,11 @@
                         "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a",
                         "installed_by": ["modules"]
                     },
+                    "pigz/compress": {
+                        "branch": "master",
+                        "git_sha": "0eab94fc1e48703c1b0a8704bd665f554905c39d",
+                        "installed_by": ["modules"]
+                    },
                     "samtools/fasta": {
                         "branch": "master",
                         "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62",

diff --git a/modules/local/blobtoolkit/updatemeta.nf b/modules/local/blobtoolkit/updatemeta.nf
@@ -5,7 +5,7 @@ process BLOBTOOLKIT_UPDATEMETA {
     if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
         exit 1, "BLOBTOOLKIT_UPDATEMETA module does not support Conda. Please use Docker / Singularity / Podman instead."
     }
-    container "docker.io/pacificbiosciences/pyyaml:5.3.1"
+    container "docker.io/genomehubs/blobtoolkit:4.3.9"
 
     input:
     tuple val(meta), path(input)

diff --git a/modules/nf-core/blast/blastn/blast-blastn.diff b/modules/nf-core/blast/blastn/blast-blastn.diff
diff --git a/modules/nf-core/blast/blastn/main.nf b/modules/nf-core/blast/blastn/main.nf