Skip to content

Commit

Permalink
Merge pull request #1076 from jbv2/circularmapper
Browse files Browse the repository at this point in the history
DSL2: Starting subworkflow for circularmapper
  • Loading branch information
jfy133 authored Aug 16, 2024
2 parents 0ceda2c + 81784cb commit d63da95
Show file tree
Hide file tree
Showing 23 changed files with 858 additions and 134 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
- "-profile test,docker --preprocessing_tool adapterremoval --preprocessing_adapterlist 'https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/adapterremoval/adapterremoval_adapterlist.txt' --sequencing_qc_tool falco --run_genotyping --genotyping_tool 'freebayes' --genotyping_source 'raw'"
- "-profile test,docker --mapping_tool bwamem --run_mapdamage_rescaling --run_pmd_filtering --run_trim_bam --run_genotyping --genotyping_tool 'ug' --genotyping_source 'trimmed'"
- "-profile test,docker --mapping_tool bowtie2 --damagecalculation_tool mapdamage --damagecalculation_mapdamage_downsample 100 --run_genotyping --genotyping_tool 'hc' --genotyping_source 'raw'"
- "-profile test,docker --skip_preprocessing --convert_inputbam"
- "-profile test,docker --mapping_tool circularmapper --skip_preprocessing --convert_inputbam --fasta_circular_target 'NC_007596.2' --fasta_circularmapper_elongationfactor 500"
- "-profile test_humanbam,docker --run_mtnucratio --run_contamination_estimation_angsd --snpcapture_bed 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/reference/Human/1240K.pos.list_hs37d5.0based.bed.gz' --run_genotyping --genotyping_tool 'pileupcaller' --genotyping_source 'raw'"
- "-profile test_humanbam,docker --run_sexdeterrmine --run_genotyping --genotyping_tool 'angsd' --genotyping_source 'raw'"
- "-profile test_multiref,docker" ## TODO add damage manipulation here instead once it goes multiref
Expand Down
4 changes: 4 additions & 0 deletions CITATIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,10 @@

> Sex.DetERRmine.py Lamnidis, T.C. et al., 2018. Ancient Fennoscandian genomes reveal origin and spread of Siberian ancestry in Europe. Nature communications, 9(1), p.5018. Available at: http://dx.doi.org/10.1038/s41467-018-07483-5. Download: https://github.com/TCLamnidis/Sex.DetERRmine
- [CircularMapper](https://doi.org/10.1186/s13059-016-0918-z)

> Peltzer, A., Jäger, G., Herbig, A., Seitz, A., Kniep, C., Krause, J., & Nieselt, K. (2016). EAGER: efficient ancient genome reconstruction. Genome Biology, 17(1), 1–14. doi: [10.1186/s13059-016-0918-z](https://doi.org/10.1186/s13059-016-0918-z)
## Software packaging/containerisation tools

- [Anaconda](https://anaconda.com)
Expand Down
16 changes: 15 additions & 1 deletion assets/schema_fasta.json
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,21 @@
"circular_target": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "The headers of the chromosome to be extended by circularmapper must not contain any spaces and no leading '>'."
"errorMessage": "The headers of the chromosome extended by circulargenerator must not contain any spaces and no leading '>'."
},
"circularmapper_elongatedfasta": {
"type": "string",
"format": "file-path",
"pattern": "^\\S+\\.f(na|asta|a|as)(\\.gz)?$",
"exists": true,
"errorMessage": "The elongated Fasta files for the mapping reference must be provided with file extensions '.fasta', '.fa', '.fas', '.fna', '.fasta.gz','.fa.gz','.fas.gz', '.fna.gz' and cannot contain any spaces."
},
"circularmapper_elongatedindex": {
"type": "string",
"format": "directory-path",
"pattern": "^\\S+$",
"exists": true,
"errorMessage": "The directories of the index files for the elongated mapping reference for circularmapper must not contain any spaces and have file extensions ''."
},
"mitochondrion_header": {
"type": "string",
Expand Down
76 changes: 69 additions & 7 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,35 @@ process {
]
}

// Reference elongation and indexing for circular mapping
withName: GUNZIP_ELONGATED_FASTA {
publishDir = [
path: { "${params.outdir}/reference/${meta.id}_${params.fasta_circularmapper_elongationfactor}/" },
mode: params.publish_dir_mode,
pattern: '*_*[0-9].f*',
enabled: params.save_reference
]
}

withName: CIRCULARMAPPER_CIRCULARGENERATOR {
tag = { "${meta.id}_${params.fasta_circularmapper_elongationfactor}" }
publishDir = [
path: { "${params.outdir}/reference/${meta.id}_${params.fasta_circularmapper_elongationfactor}/" },
mode: params.publish_dir_mode,
pattern: '*_*[0-9].fasta',
enabled: params.save_reference
]
}

withName: BWA_INDEX_CIRCULARISED {
publishDir = [
path: { "${params.outdir}/reference/${meta.id}_${params.fasta_circularmapper_elongationfactor}/" },
mode: params.publish_dir_mode,
pattern: 'bwa',
enabled: params.save_reference
]
}

//
// BAM INPUT
//
Expand All @@ -294,6 +323,9 @@ process {

withName: SAMTOOLS_INDEX_BAM_INPUT {
tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}_L${meta.lane}" }
publishDir = [
enabled: false
]
}

//
Expand Down Expand Up @@ -404,7 +436,7 @@ process {
withName: BWA_ALN {
tag = { "${meta.id_index}|${meta.sample_id}_${meta.library_id}_L${meta.lane}" }
ext.args = { "-n ${params.mapping_bwaaln_n} -k ${params.mapping_bwaaln_k} -l ${params.mapping_bwaaln_l} -o ${params.mapping_bwaaln_o}" }
ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}_${meta.reference}" }
ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}_${meta.id_index}" }
publishDir = [
enabled: false
]
Expand All @@ -417,7 +449,7 @@ process {
[
"-r '@RG\\tID:ILLUMINA-${meta.sample_id}_${meta.library_id}\\tSM:${meta.sample_id}\\tLB:${meta.library_id}\\tPL:illumina\\tPU:ILLUMINA-${meta.library_id}-${meta.strandedness}_stranded-${se_pe_string}'"
].join(' ').trim() }
ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}_${meta.reference}" }
ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}_${meta.id_index}" }
publishDir = [
enabled: false
]
Expand Down Expand Up @@ -502,7 +534,7 @@ process {
tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" }
ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_sorted" }
publishDir = [
path: { "${params.outdir}/mapping/" },
path: { "${params.outdir}/mapping/${params.mapping_tool}/" },
mode: params.publish_dir_mode,
pattern: '*.{bam}'
]
Expand All @@ -513,22 +545,52 @@ process {
ext.args = { params.fasta_largeref ? "-c" : "" }
ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" }
publishDir = [
path: { "${params.outdir}/mapping/" },
path: { "${params.outdir}/mapping/${params.mapping_tool}/" },
mode: params.publish_dir_mode,
pattern: '*.{bai,csi}'
]
}

withName: SAMTOOLS_FLAGSTAT_MAPPED {
withName: SAMTOOLS_FLAGSTAT_MERGED_LANES {
tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}" }
ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}" }
ext.prefix = { "${meta.sample_id}_${meta.library_id}_${meta.reference}_sorted" }
publishDir = [
path: { "${params.outdir}/mapping/" },
path: { "${params.outdir}/mapping/${params.mapping_tool}/" },
mode: params.publish_dir_mode,
pattern: '*.flagstat'
]
}

// Circular mapping
// Configuration for BWA_ALN and BWA_SAMSE/SAMPE is the same as for the non-circular mapping
withName: CIRCULARMAPPER_REALIGNSAMFILE {
tag = { "${meta.reference}|${meta.sample_id}_${meta.library_id}_L${meta.lane}" }
ext.args = { params.mapping_circularmapper_circularfilter ? "-f true -x true" : "" }
ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}_${meta.reference}" }
publishDir = [
enabled: false
]
}

withName: ".*MAP:CIRCULARMAPPER:FASTQ_ALIGN_BWAALN_ELONGATED:SAMTOOLS_INDEX" {
tag = { "${meta.id_index}|${meta.sample_id}_${meta.library_id}_L${meta.lane}" }
ext.args = { params.fasta_largeref ? "-c" : "" }
ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}_${meta.reference}" }
publishDir = [
enabled: false
]
}

withName: ".*MAP:CIRCULARMAPPER:SAMTOOLS_INDEX_REALIGNED" {
tag = { "${meta.id_index}|${meta.sample_id}_${meta.library_id}_L${meta.lane}" }
ext.args = { params.fasta_largeref ? "-c" : "" }
ext.prefix = { "${meta.sample_id}_${meta.library_id}_L${meta.lane}_${meta.reference}" }
publishDir = [
enabled: false
]

}

//
// DEDUPLICATION
//
Expand Down
32 changes: 32 additions & 0 deletions docs/development/manual_tests.md
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,38 @@ nextflow run ../main.nf -profile test,singularity --outdir ./results -resume -du

```

### CircularMapper

```bash
## CircularMapper with reference elongation
## Expect: Reference elongation is run, and circularmapper SWF is run.
## Check: Expect the elongated reference and BWA index directory within the `reference` directory. Also 2 bam files together with their BAIs and Flagstats in the `mapping/circularmapper` directory.
nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -dump-channels -ansi-log false --fasta_circular_target 'NC_007596.2' --mapping_tool circularmapper --save_reference
```

```bash
## CircularMapper with an already elongated reference. Big reference flag. Also check that bwa_aln flags also propagate when using circularmapper.
## Expect: Reference elongation is NOT run, and circularmapper SWF is run.
## Check: 2 bam files together with their CSIs and Flagstats in the `mapping/circularmapper` directory.
## Also check the .command.sh for the -k and -n flags during BWA ALN.
nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -dump-channels -ansi-log false --fasta_circular_target 'NC_007596.2' --mapping_tool circularmapper --fasta_circularmapper_elongatedfasta data/reference/Mammoth_MT_Krause_500/Mammoth_MT_Krause_500.fasta --fasta_circularmapper_elongatedindex data/reference/Mammoth_MT_Krause_500/bwa --fasta_largeref --mapping_bwaaln_n 0.05 --mapping_bwaaln_k 3
```

```bash
## Multiref with circularmapper. reference_sheet_multiref.csv edited to include elongated reference and index from first CM manual test for Mammoth_MT.
## Expect: No elongation for Mammoth MT. Elongation for hs37d5_chr21-MT reference.
## Check: 2 bam files together with their CSIs and Flagstats in the `mapping/circularmapper` directory PER REFERENCE (3 libraries (from 2 samples) x 2 references x 3 files = 18 files total).
## Also, elongated hs37d5_chr21-MT is not saved, since --save_reference was not specified. But it did get elongated.
nextflow run main.nf -profile test_multiref,docker --outdir ./results -w work/ -resume -dump-channels -ansi-log false --fasta_sheet /Users/lamnidis/Software/github/jbv2/eager/data/reference/reference_sheet_multiref.csv --mapping_tool circularmapper --fasta_largeref --mapping_bwaaln_n 0.05 --mapping_bwaaln_k 3
```

```bash
## Circularmapper with circularfilter, with a provided elongated reference.
## Expect: No elongation for Mammoth MT.
## Check: 2 bam files together with their CSIs and Flagstats in the `mapping/circularmapper` directory. (6 files total). Ensure files have the @SQ tag of the circular chromosome.
nextflow run main.nf -profile test,docker --outdir ./results -w work/ -resume -dump-channels -ansi-log false --fasta_circular_target 'NC_007596.2' --mapping_tool circularmapper --fasta_circularmapper_elongatedfasta data/reference/Mammoth_MT_Krause_500/Mammoth_MT_Krause_500.fasta --fasta_circularmapper_elongatedindex data/reference/Mammoth_MT_Krause_500/bwa --fasta_largeref --mapping_bwaaln_n 0.05 --mapping_bwaaln_k 3 --mapping_circularmapper_circularfilter
```

## Host Removal

All possible parameters
Expand Down
38 changes: 36 additions & 2 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,25 @@ Depending on what is supplied by the user, and if `--save_reference` is supplied

It is highly recommended to move these files to a central location or cache directory on your machine to facilitate resume of the indices across different pipeline runs. In many cases indexing the reference genome for alignment can be the longest step of a pipeline run, therefore re-using indices in future runs (supplied to the pipeline with flags such as `--fasta_fai`, `--fasta_dict`, etc. or added to the reference sheet provided to `--fasta`) can greatly speed up analyses on other samples.

#### Reference Elongation

<details markdown="1">
<summary>Output files</summary>

- `reference/`
- `<reference_id>_<elongation_factor>/`
- `*.{fasta,fna,fa,fas}`: Uncompressed input FASTA file (if supplied to pipeline gzipped).
- `bwa/`:
- `*.fasta.{amb,ann,bwt,pac,sa}`: BWA aligner(s) reference index files from `bwa index`.

</details>

Mapping with `circularmapper` requires an elongated reference built by [CircularMapper/CircularGenerator](https://github.com/apeltzer/CircularMapper). CircularGenerator elongates the `--fasta_circular_target` of a supplied reference genome fasta by the number of base pairs specified in `--fasta_circularmapper_elongationfactor`.

Depending on what is supplied by the user, and if `--save_reference` is supplied, this directory will contain the elongated reference fasta, as well as its corresponding bwa reference index files.

It is highly recommended to move these files to a central location or cache directory on your machine to facilitate resume of the indices across different pipeline runs. In many cases indexing the reference genome for alignment can be the longest step of a pipeline run, therefore re-using indices in future runs (supplied to the pipeline with flags such as `--fasta_circularmapper_elongatedfasta`, `--fasta_circularmapper_elongatedindex`, etc. or added to the reference sheet provided to `--fasta`) can greatly speed up analyses on other samples.

### Preprocessing

#### Falco
Expand Down Expand Up @@ -161,7 +180,7 @@ The resulting FASTQ files will only be present in your results directory if you
<details markdown="1">
<summary>Output files</summary>

- `mapping/`
- `mapping/bwa{aln,mem}/`

- `*.bam`: Sorted reads aligned against a reference genome in BAM format with no additional filtering.
- `*.{bai,csi}`: Index file corresponding to a BAM file which is for faster downstream steps (e.g. SAMtools).
Expand All @@ -176,7 +195,7 @@ The resulting FASTQ files will only be present in your results directory if you
<details markdown="1">
<summary>Output files</summary>

- `mapping/`
- `mapping/bowtie2/`

- `*.bam`: Sorted reads aligned against a reference genome in BAM format with no additional filtering.
- `*.{bai,csi}`: Index file corresponding to a BAM file which is for faster downstream steps (e.g. SAMtools).
Expand All @@ -186,6 +205,21 @@ The resulting FASTQ files will only be present in your results directory if you

[Bowtie 2](https://bowtie-bio.sourceforge.net/bowtie2/manual.shtml) is an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences. It is particularly good at aligning reads of about 50 up to 100s of characters to relatively long (e.g. mammalian) genomes. Bowtie 2 indexes the genome with an FM Index (based on the Burrows-Wheeler Transform or BWT) to keep its memory footprint small and supports gapped, local, and paired-end alignment modes.

#### CircularMapper

<details markdown="1">
<summary>Output files</summary>

- `mapping/circularmapper/`

- `*.bam`: Sorted reads aligned against an elongated reference genome in BAM format with no additional filtering.
- `*.{bai,csi}`: Index file corresponding to a BAM file which is for faster downstream steps (e.g. SAMtools).
- `*.flagstat`: Statistics of aligned reads from SAMtools `flagstat`.

</details>

[CircularMapper RealignSAMFile](https://github.com/apeltzer/CircularMapper/tree/master) is an extension to `bwa aln` for realigning reads mapped to circularised contigs. First, an elongated/circularised reference is built using CircularGenerator, then reads are mapped to this reference using BWA ALN. The resulting BAM file is then realigned using CircularMapper RealignSAMFile. The reference coordinates of this BAM file have been adjusted to those of the original reference genome (prior to elongation).

### Host Removal

<details markdown="1">
Expand Down
10 changes: 10 additions & 0 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,16 @@
"git_sha": "02fd5bd7275abad27aad32d5c852e0a9b1b98882",
"installed_by": ["modules"]
},
"circularmapper/circulargenerator": {
"branch": "master",
"git_sha": "a7b0131370d9bc38076efad88773bca5537203d0",
"installed_by": ["modules"]
},
"circularmapper/realignsamfile": {
"branch": "master",
"git_sha": "a7b0131370d9bc38076efad88773bca5537203d0",
"installed_by": ["modules"]
},
"damageprofiler": {
"branch": "master",
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

61 changes: 61 additions & 0 deletions modules/nf-core/circularmapper/circulargenerator/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit d63da95

Please sign in to comment.