diff --git a/README.md b/README.md index 2e398d5..7e33c1e 100644 --- a/README.md +++ b/README.md @@ -2,14 +2,13 @@ [![GitHub commits since latest release](https://img.shields.io/github/commits-since/plantinformatics/pretzel-input-generator/latest.svg?style=for-the-badge&logo=github)](https://github.com/plantinformatics/pretzel-input-generator/releases) -![GitHub Workflow Status](https://img.shields.io/github/workflow/status/plantinformatics/pretzel-input-generator/CI?label=CI%20TESTS&logo=github&style=for-the-badge) +[![GitHub Workflow Status](https://img.shields.io/github/workflow/status/plantinformatics/pretzel-input-generator/CI?label=CI%20TESTS&logo=github&style=for-the-badge)](https://github.com/plantinformatics/pretzel-input-generator/actions) -**Note that this README is partly out-of-date ** - [Pipeline overview](#pipeline-overview) - [Default pipeline](#default-pipeline) - [Quick start example using microsporidia data](#quick-start-example-using-microsporidia-data) - - [Input specification (triticeae and other relevant data sets)](#input-specification-triticeae-and-other-relevant-data-sets) + - [Input specification](#input-specification) - [Disparate triticeae datasets](#disparate-triticeae-datasets) - [Dependencies](#dependencies) - [Execution](#execution) @@ -46,7 +45,7 @@ nextflow run plantinformatics/pretzel-input-generator \ This will pull and process data sets specified in [`conf/microsporidia.config`](conf/microsporidia.config) -## Input specification (triticeae and other relevant data sets) +## Input specification A mix of local and remote files can be specified - see [`conf/microsporidia.config`](conf/microsporidia.config) and the corresponding [`conf/test-data.config`](conf/test-data.config) @@ -97,13 +96,21 @@ Wherever possible the assembly files are used as input for the pipeline in their * [Minimap2](https://github.com/lh3/minimap2) - if placing markers * `jq` * `groovy` interpreter + * and who knows what else - try to stick to either docker or 
singularity When using Singularity or Docker, the required containers are specified in [`conf/containers.config`](conf/containers.config) -and pulled by Nextflow as required. +and pulled by Nextflow as required. If Singularity fails when trying to pull multiple container images simultaneously, run + +``` +nextflow run pull_containers.nf -profile singularity +``` + +which will pull the container images sequentially. + ## Execution -We provide several execution profiles, "locally" may mean a designated server or an interactive session on a cluster. By appending e.g. `-revision v1.1` to your command you can specify a release tag to run a specific revision. When re-running the pipeline after errors or changes use `-resume` to ensure only the necessary processes are re-run. +We provide several execution profiles, "locally" may mean a designated server or an interactive session on a cluster. By appending e.g. `-revision v2.0` to your command you can specify a release tag to run a specific revision of the pipeline. When re-running the pipeline after errors or changes use `-resume` to ensure only the necessary processes are re-run. Run locally with docker @@ -135,7 +142,7 @@ All generated JSON files generated by the pipeline are output to `results/JSON`. * `*_annotation.json.gz` - specifications of coordinates of features (genes) within blocks * In addition, for each (lexicographically ordered) pair of genome assemblies, the pipeline generates: * `*_aliases.json.gz` which specify links between features between the two genomes. -* `*_markers.json.gz` - placement of marker sequences as features within blocks +* `*_{markers,transcripts,cds,genomic}.json.gz` - placement of marker or other sequences as features within blocks The output files (hopefully) conform to the requirements of [pretzel data structure](https://github.com/plantinformatics/pretzel-data). 
@@ -155,7 +162,7 @@ To upload the generated data to your instance of pretzel, follow [these instruct # BUSCO-based pipeline -This approach is much simpler and at the same time computationally intensive. +This approach is much simpler and yet computationally very intensive. Its main advantage is that it does not require gene annotations; all that is required is a set of genome assemblies. ![doc/dag-busco.png](doc/dag-busco.png) diff --git a/bin/gff_2_pretzel.py b/bin/gff_2_pretzel.py index f8fc153..153d8cf 100755 --- a/bin/gff_2_pretzel.py +++ b/bin/gff_2_pretzel.py @@ -57,10 +57,10 @@ def eprint(*args, **kwargs): #Add feature to block scope[chr].append(OrderedDict([ ("name", attributes["Name"]), - ("value", [ toks[3], toks[4] ]), + ("value", [ int(toks[3]), int(toks[4]) ]), ("evidence", OrderedDict([ - ("identity", attributes["identity"]), - ("coverage", attributes["coverage"]) + ("identity", float(attributes["identity"])), + ("coverage", float(attributes["coverage"])) ])) ])) diff --git a/conf/triticeae.config b/conf/triticeae.config index 6ba3cc0..3a97f08 100644 --- a/conf/triticeae.config +++ b/conf/triticeae.config @@ -21,7 +21,7 @@ params { species : "Brachypodium_distachyon", version : "v1.0", shortName : "Brachy", - pep : "ftp://ftp.ensemblgenomes.org/pub/plants/release-39/fasta/brachypodium_distachyon/pep/Brachypodium_distachyon.v1.0.pep.all.fa.gz", + pep : "ftp://ftp.ensemblgenomes.org/pub/plants/release-39/fasta/brachypodium_distachyon/pep/Brachypodium_distachyon.v1.0.pep.all.fa.gz", idx : "ftp://ftp.ensemblgenomes.org/pub/plants/release-39/fasta/brachypodium_distachyon/dna_index/Brachypodium_distachyon.v1.0.dna.toplevel.fa.gz.fai", // fasta : "ftp://ftp.ensemblgenomes.org/pub/plants/release-39/fasta/brachypodium_distachyon/dna/Brachypodium_distachyon.v1.0.dna.toplevel.fa.gz", source : "https://plants.ensembl.org/Brachypodium_distachyon", @@ -56,6 +56,16 @@ params { source : "https://www.ebi.ac.uk/ena/data/view/GCA_002575655.1 
http://aegilops.wheat.ucdavis.edu/ATGSP/annotation/", citation : "Genome sequence of the progenitor of the wheat D-genome Aegilops tauschii. M C Luo et al. Nature 551, 498–502. doi:10.1038/nature24486" ], + [ + species : "Hordeum_vulgare", + version : "Hv_IBSC_PGSB_v2", + shortName : "Barley", + pep : "ftp://ftp.ensemblgenomes.org/pub/plants/release-39/fasta/hordeum_vulgare/pep/Hordeum_vulgare.Hv_IBSC_PGSB_v2.pep.all.fa.gz", + idx : "ftp://ftp.ensemblgenomes.org/pub/plants/release-39/fasta/hordeum_vulgare/dna_index/Hordeum_vulgare.Hv_IBSC_PGSB_v2.dna.toplevel.fa.gz.fai", + fasta : "ftp://ftp.ensemblgenomes.org/pub/plants/release-39/fasta/hordeum_vulgare/dna/Hordeum_vulgare.Hv_IBSC_PGSB_v2.dna.toplevel.fa.gz", + source : "https://plants.ensembl.org/Hordeum_vulgare", + citation : "https://doi.org/10.1038/nature22043" + ], [ species : "Hordeum_vulgare", cultivar: "Morex", @@ -90,6 +100,22 @@ params { source : "http://rice.plantbiology.msu.edu https://rapdb.dna.affrc.go.jp/", citation : "Improvement of the Oryza sativa Nipponbare reference genome using next generation sequence and optical map data. Yoshihiro Kawahara et al., Rice 6:4 2013." ], + // [ + // species : "Secale_cereale", + // version : "Lo7_2018v1p1p1", + // shortName : "Rye", + // pep : [ + // HC: "local/rye/Secale_cereale_Lo7_2018v1p1p1.pgsb.Feb2019.HC.aa.fasta", + // LC: "local/rye/Secale_cereale_Lo7_2018v1p1p1.pgsb.Feb2019.LC.aa.fasta" + // ], + // gtfgff3 : [ + // HC: "local/rye/Secale_cereale_Lo7_2018v1p1p1.pgsb.Feb2019.HC.gff3", + // LC: "local/rye/Secale_cereale_Lo7_2018v1p1p1.pgsb.Feb2019.LC.gff3" + // ], + // idx : "local/rye/Secale_cereale_Lo7_2018v1p1p1.pgsb.Feb2019.FAKE.len", + // source : "https://doi.ipk-gatersleben.de/DOI/8afb3971-b5e1-4748-8f0e-1b929ba73248/98b53069-746d-4b03-8f30-5757735bf9b9/2/1847940088", + // citation : "Chromosome-scale genome assembly provides insights into rye biology, evolution, and agronomic potential. 
https://doi.org/10.1101/2019.12.11.869693" + // ], [ species : "Triticum_aestivum", version : "IWGSC_RefSeq_v1.0", @@ -107,18 +133,18 @@ params { source : "https://wheat-urgi.versailles.inra.fr/Seq-Repository/Assemblies", citation : "The International Wheat Genome Sequencing Conosrtium. Shifting the limits in wheat research and breeding through a fully annotated and anchored reference genome sequence. Science. 2018. https://doi.org/10.1126/science.aar7191" ], - [ - species : "Triticum_aestivum", - version : "IWGSC_RefSeq_v2.0", - shortName : "IWGSCv2", - // pep : [ - best to wait for official annotations or place old using genomic sequences - // HC : "local/iwgsc_refseqv1.0_HighConf_REPR_CDS_2017Apr03_on_refseqv2.0.fasta", - // LC : "local/iwgsc_refseqv1.0_LowConf_REPR_CDS_2017Apr03_on_refseqv2.0.fasta", - // ], - idx : "local/iwgsc_refseqv2.0_all_chromosomes.fa.fai", - fasta : "local/iwgsc_refseqv2.0_all_chromosomes.fa", - source : "https://wheat-urgi.versailles.inra.fr/Seq-Repository/Assemblies", - ], + // [ + // species : "Triticum_aestivum", + // version : "IWGSC_RefSeq_v2.0", + // shortName : "IWGSCv2", + // // pep : [ - best to wait for official annotations or place old using genomic sequences + // // HC : "local/iwgsc_refseqv1.0_HighConf_REPR_CDS_2017Apr03_on_refseqv2.0.fasta", + // // LC : "local/iwgsc_refseqv1.0_LowConf_REPR_CDS_2017Apr03_on_refseqv2.0.fasta", + // // ], + // idx : "local/iwgsc_refseqv2.0_all_chromosomes.fa.fai", + // fasta : "local/iwgsc_refseqv2.0_all_chromosomes.fa", + // source : "https://wheat-urgi.versailles.inra.fr/Seq-Repository/Assemblies", + // ], // //10 wheats START ========================================== // [ // species : "Triticum_aestivum", @@ -200,6 +226,7 @@ params { // fasta : "local/10wheats/190524_spelt_pseudomolecules_v1.0.fasta.gz", // source : "http://www.10wheatgenomes.com/ https://wheat.ipk-gatersleben.de/", // ], + // // [ // // species : "Triticum_aestivum", // // version : "",