diff --git a/CMakeLists.txt b/CMakeLists.txt index 18449d4..491c7c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,7 +92,7 @@ set(ARTIC_LINK_LIBRARIES ${HTS_LIB} ${ARTIC_LINK_LIBRARIES}) set(Boost_USE_STATIC_LIBS OFF) set(Boost_USE_MULTITHREADED ON) set(Boost_USE_STATIC_RUNTIME OFF) -find_package(Boost 1.45.0 REQUIRED) +find_package(Boost 1.45.0 COMPONENTS filesystem REQUIRED) # cli11, spdlog add_subdirectory(extlibs/CLI11) diff --git a/README.md b/README.md index 4a49328..6fb8bbf 100644 --- a/README.md +++ b/README.md @@ -14,9 +14,10 @@ This collection of tools is being worked on and more utility is due to be added. For now, we have: -* primer scheme validation -* alignment softmasking -* vcf filtering +- download primer schemes and reference sequences +- primer scheme validation +- alignment softmasking +- vcf filtering Read the [docs](https://artic-tools.readthedocs.io/en/latest/) for more info and checkout the [ARTIC pipeline](https://github.com/artic-network/fieldbioinformatics). diff --git a/app/artic-tools.cpp b/app/artic-tools.cpp index dc89210..e75cb95 100644 --- a/app/artic-tools.cpp +++ b/app/artic-tools.cpp @@ -29,6 +29,7 @@ int main(int argc, char** argv) // set up the subcommands app.require_subcommand(); CLI::App* softmaskCmd = app.add_subcommand("align_trim", "Trim alignments from an amplicon scheme"); + CLI::App* getterCmd = app.add_subcommand("get_scheme", "Download an ARTIC primer scheme and reference sequence"); CLI::App* validatorCmd = app.add_subcommand("validate_scheme", "Validate an amplicon scheme for compliance with ARTIC standards"); CLI::App* vcfFilterCmd = app.add_subcommand("check_vcf", "Check a VCF file based on primer scheme info and user-defined cut offs"); @@ -37,18 +38,16 @@ int main(int argc, char** argv) std::string primerSchemeFile; std::string outFileName; std::string refSeq; - int primerSchemeVersion = 3; + unsigned int primerSchemeVersion = 3; unsigned int minMAPQ = 15; unsigned int normalise = 100; bool primerStart = false; bool removeBadPairs = false; bool noReadGroups = false; bool verbose = false; - bool ignoreVersion = false; softmaskCmd->add_option("-b,--bamFile", bamFile, "The input bam file (will try STDIN if not provided)"); softmaskCmd->add_option("scheme", primerSchemeFile, "The ARTIC primer scheme")->required()->check(CLI::ExistingFile); softmaskCmd->add_option("--schemeVersion", primerSchemeVersion, "The ARTIC primer scheme version (default = 3)"); - softmaskCmd->add_flag("--noSchemeVersion", ignoreVersion, "Ignore the ARTIC primer scheme version"); softmaskCmd->add_option("--minMAPQ", minMAPQ, "A minimum MAPQ threshold for processing alignments (default = 15)"); softmaskCmd->add_option("--normalise", normalise, "Subsample to N coverage per strand (default = 100, deactivate with 0)"); softmaskCmd->add_option("--report", outFileName, "Output an align_trim report to file"); @@ -57,10 +56,16 @@ int main(int argc, char** argv) softmaskCmd->add_flag("--no-read-groups", noReadGroups, "Do not divide reads into groups in SAM output"); softmaskCmd->add_flag("--verbose", verbose, "Output debugging information to STDERR"); + // add gett options and flags + std::string getSchemeName; + unsigned int getSchemeVersion = 0; + getterCmd->add_option("scheme", getSchemeName, "The name of the scheme to download (ebola|nipah|scov2)")->required(); + getterCmd->add_option("--schemeVersion", getSchemeVersion, "The ARTIC primer scheme version (default = latest)"); + getterCmd->add_option("-o,--outDir", outFileName, "The directory to write the scheme and reference sequence to"); + // add validator options and flags validatorCmd->add_option("scheme", primerSchemeFile, "The primer scheme to validate")->required()->check(CLI::ExistingFile); validatorCmd->add_option("--schemeVersion", primerSchemeVersion, "The ARTIC primer scheme version (default = 3)"); - validatorCmd->add_flag("--noSchemeVersion", ignoreVersion, "Ignore the ARTIC primer scheme version"); validatorCmd->add_option("-o,--outputPrimerSeqs", outFileName, "If provided, will write primer sequences as multiFASTA (requires --refSeq to be provided)"); validatorCmd->add_option("-r,--refSeq", refSeq, "The reference sequence for the primer scheme (FASTA format)"); @@ -71,7 +76,6 @@ int main(int argc, char** argv) vcfFilterCmd->add_option("scheme", primerSchemeFile, "The primer scheme to use")->required()->check(CLI::ExistingFile); vcfFilterCmd->add_option("vcf", vcfIn, "The input VCF file to filter")->required()->check(CLI::ExistingFile); vcfFilterCmd->add_option("--schemeVersion", primerSchemeVersion, "The ARTIC primer scheme version (default = 3)"); - vcfFilterCmd->add_flag("--noSchemeVersion", ignoreVersion, "Ignore the ARTIC primer scheme version"); vcfFilterCmd->add_option("-o,--vcfOut", outFileName, "If provided, will write variants that pass checks"); vcfFilterCmd->add_flag("--dropPrimerVars", dropPrimerVars, "Will drop variants called within primer regions for the pool"); vcfFilterCmd->add_flag("--dropOverlapFails", dropOverlapFails, "Will drop variants called once within amplicon overlap regions"); @@ -79,21 +83,23 @@ int main(int argc, char** argv) // add the softmask callback softmaskCmd->callback([&]() { // load and check the primer scheme - auto ps = (ignoreVersion) ? artic::PrimerScheme(primerSchemeFile) : artic::PrimerScheme(primerSchemeFile, primerSchemeVersion); + auto ps = artic::PrimerScheme(primerSchemeFile, primerSchemeVersion); // setup and run the softmasker auto masker = artic::Softmasker(&ps, bamFile, userCmd.str(), minMAPQ, normalise, removeBadPairs, noReadGroups, primerStart, outFileName); masker.Run(verbose); }); + // add the getter callback + getterCmd->callback([&]() { + artic::DownloadScheme(getSchemeName, getSchemeVersion, outFileName); + }); + // add the validator callback validatorCmd->callback([&]() { - auto ps = (ignoreVersion) ? artic::PrimerScheme(primerSchemeFile) : artic::PrimerScheme(primerSchemeFile, primerSchemeVersion); + auto ps = artic::PrimerScheme(primerSchemeFile, primerSchemeVersion); std::cout << "primer scheme file:\t" << primerSchemeFile << std::endl; - if (!ignoreVersion) - std::cout << "primer scheme version:\t" << ps.GetVersion() << std::endl; - else - std::cout << "primer scheme version:\tunversioned" << std::endl; + std::cout << "primer scheme version:\t" << ps.GetVersion() << std::endl; std::cout << "reference sequence ID:\t" << ps.GetReferenceName() << std::endl; std::cout << "number of pools:\t" << ps.GetPrimerPools().size() << std::endl; std::cout << "number of primers:\t" << ps.GetNumPrimers() << " (includes " << ps.GetNumAlts() << " alts)" << std::endl; @@ -130,7 +136,7 @@ int main(int argc, char** argv) // add the vcfFilter callback vcfFilterCmd->callback([&]() { - auto ps = (ignoreVersion) ? artic::PrimerScheme(primerSchemeFile) : artic::PrimerScheme(primerSchemeFile, primerSchemeVersion); + auto ps = artic::PrimerScheme(primerSchemeFile, primerSchemeVersion); // setup and run the softmasker auto filter = artic::VcfChecker(&ps, vcfIn, outFileName, dropPrimerVars, dropOverlapFails); diff --git a/artic/primerScheme.cpp b/artic/primerScheme.cpp index d17a5f8..c05f148 100644 --- a/artic/primerScheme.cpp +++ b/artic/primerScheme.cpp @@ -13,8 +13,6 @@ const std::string LEFT_PRIMER_TAG = "_LEFT"; const std::string RIGHT_PRIMER_TAG = "_RIGHT"; const std::string ALT_PRIMER_TAG = "_alt"; const std::string NO_POOL = "unmatched"; -const unsigned int NUM_ARTIC_PRIMERS_v1_v2 = 196; -const unsigned int NUM_ARTIC_PRIMERS_v3 = 218; // Primer constructor. artic::Primer::Primer(unsigned int start, unsigned int end, const std::string primerID, size_t poolID) @@ -103,33 +101,10 @@ const std::string artic::Primer::GetSeq(faidx_t* reference, const std::string& r } // PrimerScheme constructor. -artic::PrimerScheme::PrimerScheme(const std::string& inputFile) -{ - _loadScheme(inputFile); - _validateScheme(); -} - -// PrimerScheme constructor with version checking. artic::PrimerScheme::PrimerScheme(const std::string& inputFile, unsigned int schemeVersion) : _version(schemeVersion) { - - // check the version provided - if (_version < 1 || _version > 3) - throw std::runtime_error("unrecognised primer scheme version - " + std::to_string(_version)); - - // load the scheme _loadScheme(inputFile); - - // check primer count matches scheme version - if (_version < 3) - if (_numPrimers != NUM_ARTIC_PRIMERS_v1_v2) - throw std::runtime_error("primer count does not equal the number required by the scheme version - " + std::to_string(_numPrimers) + " vs " + std::to_string(NUM_ARTIC_PRIMERS_v1_v2)); - if (_version == 3) - if (_numPrimers != NUM_ARTIC_PRIMERS_v3) - throw std::runtime_error("primer count does not equal the number required by the scheme version - " + std::to_string(_numPrimers) + " vs " + std::to_string(NUM_ARTIC_PRIMERS_v3)); - - // check the scheme _validateScheme(); } diff --git a/artic/primerScheme.hpp b/artic/primerScheme.hpp index 9214e78..1ad8694 100644 --- a/artic/primerScheme.hpp +++ b/artic/primerScheme.hpp @@ -9,6 +9,9 @@ namespace artic { + // DownloadScheme will download a specified primer scheme and the reference sequence. + void DownloadScheme(const std::string& schemeName, unsigned int requestedVersion, const std::string& outDir); + class Primer; class PrimerScheme; class Amplicon; @@ -75,7 +78,6 @@ namespace artic { public: // PrimerScheme constructors and destructor. - PrimerScheme(const std::string& inputFile); PrimerScheme(const std::string& inputFile, unsigned int schemeVersion); ~PrimerScheme(void); diff --git a/artic/primerSchemeDownloader.cpp b/artic/primerSchemeDownloader.cpp new file mode 100644 index 0000000..e44ecac --- /dev/null +++ b/artic/primerSchemeDownloader.cpp @@ -0,0 +1,110 @@ +#include +#include +#include +#include + +#include "log.hpp" +#include "primerScheme.hpp" + +// ARTIC scheme downloads +// Note: it would be nice to have all these in one place and under a unified naming system... +const std::vector ebovUrls = {"https://github.com/artic-network/primer-schemes/raw/master/ZaireEbola/V1/ZaireEbola.scheme.bed", "https://github.com/artic-network/primer-schemes/raw/master/ZaireEbola/V2/ZaireEbola.scheme.bed", "https://github.com/artic-network/primer-schemes/raw/master/ZaireEbola/V3/Ebov-10-Pan.bed"}; +const std::vector nivUrls = {"https://github.com/artic-network/primer-schemes/raw/master/Nipah/V1/NiV_6_Malaysia.bed"}; +const std::vector scovUrls = {"https://github.com/artic-network/artic-ncov2019/raw/master/primer_schemes/nCoV-2019/V1/nCoV-2019.scheme.bed", "https://github.com/artic-network/artic-ncov2019/raw/master/primer_schemes/nCoV-2019/V2/nCoV-2019.scheme.bed", "https://github.com/artic-network/artic-ncov2019/raw/master/primer_schemes/nCoV-2019/V3/nCoV-2019.scheme.bed"}; + +const std::vector ebovRefUrls = {"https://github.com/artic-network/primer-schemes/raw/master/ZaireEbola/V1/ZaireEbola.reference.fasta", "https://github.com/artic-network/primer-schemes/raw/master/ZaireEbola/V2/ZaireEbola.reference.fasta", "https://github.com/artic-network/primer-schemes/raw/master/ZaireEbola/V3/Ebov-10-Pan.fasta"}; +const std::vector nivRefUrls = {"https://github.com/artic-network/primer-schemes/raw/master/Nipah/V1/NiV_6_Malaysia.fasta"}; +const std::vector scovRefUrls = {"https://github.com/artic-network/artic-ncov2019/raw/master/primer_schemes/nCoV-2019/V1/nCoV-2019.reference.fasta", "https://github.com/artic-network/artic-ncov2019/raw/master/primer_schemes/nCoV-2019/V2/nCoV-2019.reference.fasta", "https://github.com/artic-network/artic-ncov2019/raw/master/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta"}; + +enum Scheme +{ + InvalidScheme, + Ebov, + Nipah, + Scov2, +}; + +Scheme resolveScheme(std::string query) +{ + static const std::map schemeStrings{ + {"ebola", Ebov}, + {"nipah", Nipah}, + {"scov2", Scov2}, + }; + auto itr = schemeStrings.find(query); + if (itr != schemeStrings.end()) + return itr->second; + return InvalidScheme; +} + +// DownloadScheme will download a specified primer scheme and the reference sequence. +// if 0 is provided as a version, or an unknown version provided, the latest scheme version will be used. +void artic::DownloadScheme(const std::string& schemeName, unsigned int requestedVersion, const std::string& outDir) +{ + std::string schemeURL; + std::string refURL; + unsigned int version = requestedVersion; + switch (resolveScheme(schemeName)) + { + case Ebov: + if (version == 0 || version > ebovUrls.size()) + version = ebovUrls.size(); + schemeURL = ebovUrls.at(version - 1); + refURL = ebovRefUrls.at(version - 1); + break; + case Nipah: + if (version == 0 || version > nivUrls.size()) + version = nivUrls.size(); + schemeURL = nivUrls.at(version - 1); + refURL = nivRefUrls.at(version - 1); + break; + case Scov2: + if (version == 0 || version > scovUrls.size()) + version = scovUrls.size(); + schemeURL = scovUrls.at(version - 1); + refURL = scovRefUrls.at(version - 1); + break; + default: + throw std::runtime_error("unknown scheme: " + schemeName); + } + artic::Log::Init("getter"); + LOG_TRACE("starting scheme getter"); + LOG_TRACE("\tscheme: {}", schemeName); + LOG_TRACE("\tversion: {}", requestedVersion); + if ((requestedVersion != version) && (requestedVersion != 0)) + LOG_WARN("requested scheme version not found, using latest version instead (v{})", version); + + // create filenames + std::stringstream of1; + std::stringstream of2; + if (outDir.size() != 0) + { + if (!boost::filesystem::is_directory(outDir) || !boost::filesystem::exists(outDir)) + boost::filesystem::create_directory(outDir); + of1 << outDir << "/"; + of2 << outDir << "/"; + } + of1 << schemeName << ".v" << version << ".scheme.bed"; + of2 << schemeName << ".v" << version << ".reference.fasta"; + + // download using wget sys call + LOG_TRACE("downloading data") + std::stringstream baseCmd; + baseCmd << "wget -q -O "; + baseCmd << schemeName << ".v" << version; + std::string cmd1 = "wget -q -O " + of1.str() + " " + schemeURL + " 2>/dev/null"; + std::string cmd2 = "wget -q -O " + of2.str() + " " + refURL + " 2>/dev/null"; + int res; + res = system(cmd1.c_str()); + if (res != EXIT_SUCCESS) + throw std::runtime_error("could not download scheme with wget"); + res = system(cmd2.c_str()); + if (res != EXIT_SUCCESS) + throw std::runtime_error("could not download reference with wget"); + LOG_TRACE("\t{}", of1.str()); + LOG_TRACE("\t{}", of2.str()); + + // validate the scheme + LOG_TRACE("checking scheme") + auto ps = artic::PrimerScheme(of1.str(), version); +} diff --git a/artic/vcfCheck.cpp b/artic/vcfCheck.cpp index af1c2b9..c2051bd 100644 --- a/artic/vcfCheck.cpp +++ b/artic/vcfCheck.cpp @@ -43,7 +43,6 @@ artic::VcfChecker::~VcfChecker(void) // Run will perform the softmasking on the open BAM file. void artic::VcfChecker::Run() { - std::cout << _outfileName << std::endl; artic::Log::Init("vcfchecker"); LOG_TRACE("starting VCF checker"); if ((_outfileName.size() != 0)) diff --git a/docs/commands.md b/docs/commands.md index 6c515b4..2adabaa 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -12,6 +12,16 @@ Example usage: artic-tools align_trim -b in.bam primerscheme.bed > out.bam 2> out.log ``` +## get_scheme + +The `get_scheme` command can download primer schemes and sequences for several ARTIC references. + +Example usage: + +``` +artic-tools get_scheme nipah +``` + ## validate_scheme The `validate_scheme` command is used to check your primer scheme confirms to an ARTIC standard and can be used in our pipelines. @@ -69,10 +79,10 @@ Example ouput: [14:13:50] artic-tools::vcfchecker: nothing seen at position yet, holding var [14:13:50] artic-tools::vcfchecker: variant at pos 22868: TG->T [14:13:50] artic-tools::vcfchecker: located within an amplicon overlap region -[14:13:50] artic-tools::vcfchecker: var pos does not match with that of previously identified overlap, holding var (and dropping held var at 22862) +[14:13:50] artic-tools::vcfchecker: var pos does not match with that of previously identified overlap var, holding new var (and dropping held var at 22862) [14:13:50] artic-tools::vcfchecker: variant at pos 22896: T->TTGG [14:13:50] artic-tools::vcfchecker: located within an amplicon overlap region -[14:13:50] artic-tools::vcfchecker: var pos does not match with that of previously identified overlap, holding var (and dropping held var at 22867) +[14:13:50] artic-tools::vcfchecker: var pos does not match with that of previously identified overlap var, holding new var (and dropping held var at 22867) [14:13:50] artic-tools::vcfchecker: variant at pos 22909: TA->T [14:13:50] artic-tools::vcfchecker: variant at pos 22913: T->C [14:13:50] artic-tools::vcfchecker: variant at pos 22916: CT->TC @@ -85,7 +95,7 @@ Example ouput: [14:13:50] artic-tools::vcfchecker: variant at pos 23098: AC->GT [14:13:50] artic-tools::vcfchecker: variant at pos 23183: T->TC [14:13:50] artic-tools::vcfchecker: located within an amplicon overlap region -[14:13:50] artic-tools::vcfchecker: var pos does not match with that of previously identified overlap, holding var (and dropping held var at 22895) +[14:13:50] artic-tools::vcfchecker: var pos does not match with that of previously identified overlap var, holding new var (and dropping held var at 22895) [14:13:50] artic-tools::vcfchecker: variant at pos 23403: A->G [14:13:50] artic-tools::vcfchecker: variant at pos 27752: C->T [14:13:50] artic-tools::vcfchecker: variant at pos 28881: GGG->AAC diff --git a/tests/primerScheme_test.cpp b/tests/primerScheme_test.cpp index 89af487..e355274 100644 --- a/tests/primerScheme_test.cpp +++ b/tests/primerScheme_test.cpp @@ -35,21 +35,6 @@ TEST(primerscheme, constructor) FAIL() << "expected std::runtime_error"; } - // catch bad version - try - { - artic::PrimerScheme(inputScheme, 666); - FAIL() << "expected a no file error"; - } - catch (std::runtime_error& err) - { - EXPECT_EQ(err.what(), std::string("unrecognised primer scheme version - 666")); - } - catch (...) - { - FAIL() << "expected std::runtime_error"; - } - // constructor should pass try {