Skip to content

Commit

Permalink
Merge pull request #4 from will-rowe/downloader
Browse files Browse the repository at this point in the history
adding get_scheme subcommand
  • Loading branch information
Will Rowe authored Aug 22, 2020
2 parents 7b4f88a + 0be6d62 commit a074397
Show file tree
Hide file tree
Showing 9 changed files with 149 additions and 61 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ set(ARTIC_LINK_LIBRARIES ${HTS_LIB} ${ARTIC_LINK_LIBRARIES})
set(Boost_USE_STATIC_LIBS OFF)
set(Boost_USE_MULTITHREADED ON)
set(Boost_USE_STATIC_RUNTIME OFF)
find_package(Boost 1.45.0 REQUIRED)
find_package(Boost 1.45.0 COMPONENTS filesystem REQUIRED)

# cli11, spdlog
add_subdirectory(extlibs/CLI11)
Expand Down
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@

This collection of tools is being worked on and more utility is due to be added. For now, we have:

* primer scheme validation
* alignment softmasking
* vcf filtering
- download primer schemes and reference sequences
- primer scheme validation
- alignment softmasking
- vcf filtering

Read the [docs](https://artic-tools.readthedocs.io/en/latest/) for more info and checkout the [ARTIC pipeline](https://github.com/artic-network/fieldbioinformatics).

Expand Down
30 changes: 18 additions & 12 deletions app/artic-tools.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ int main(int argc, char** argv)
// set up the subcommands
app.require_subcommand();
CLI::App* softmaskCmd = app.add_subcommand("align_trim", "Trim alignments from an amplicon scheme");
CLI::App* getterCmd = app.add_subcommand("get_scheme", "Download an ARTIC primer scheme and reference sequence");
CLI::App* validatorCmd = app.add_subcommand("validate_scheme", "Validate an amplicon scheme for compliance with ARTIC standards");
CLI::App* vcfFilterCmd = app.add_subcommand("check_vcf", "Check a VCF file based on primer scheme info and user-defined cut offs");

Expand All @@ -37,18 +38,16 @@ int main(int argc, char** argv)
std::string primerSchemeFile;
std::string outFileName;
std::string refSeq;
int primerSchemeVersion = 3;
unsigned int primerSchemeVersion = 3;
unsigned int minMAPQ = 15;
unsigned int normalise = 100;
bool primerStart = false;
bool removeBadPairs = false;
bool noReadGroups = false;
bool verbose = false;
bool ignoreVersion = false;
softmaskCmd->add_option("-b,--bamFile", bamFile, "The input bam file (will try STDIN if not provided)");
softmaskCmd->add_option("scheme", primerSchemeFile, "The ARTIC primer scheme")->required()->check(CLI::ExistingFile);
softmaskCmd->add_option("--schemeVersion", primerSchemeVersion, "The ARTIC primer scheme version (default = 3)");
softmaskCmd->add_flag("--noSchemeVersion", ignoreVersion, "Ignore the ARTIC primer scheme version");
softmaskCmd->add_option("--minMAPQ", minMAPQ, "A minimum MAPQ threshold for processing alignments (default = 15)");
softmaskCmd->add_option("--normalise", normalise, "Subsample to N coverage per strand (default = 100, deactivate with 0)");
softmaskCmd->add_option("--report", outFileName, "Output an align_trim report to file");
Expand All @@ -57,10 +56,16 @@ int main(int argc, char** argv)
softmaskCmd->add_flag("--no-read-groups", noReadGroups, "Do not divide reads into groups in SAM output");
softmaskCmd->add_flag("--verbose", verbose, "Output debugging information to STDERR");

// add gett options and flags
std::string getSchemeName;
unsigned int getSchemeVersion = 0;
getterCmd->add_option("scheme", getSchemeName, "The name of the scheme to download (ebola|nipah|scov2)")->required();
getterCmd->add_option("--schemeVersion", getSchemeVersion, "The ARTIC primer scheme version (default = latest)");
getterCmd->add_option("-o,--outDir", outFileName, "The directory to write the scheme and reference sequence to");

// add validator options and flags
validatorCmd->add_option("scheme", primerSchemeFile, "The primer scheme to validate")->required()->check(CLI::ExistingFile);
validatorCmd->add_option("--schemeVersion", primerSchemeVersion, "The ARTIC primer scheme version (default = 3)");
validatorCmd->add_flag("--noSchemeVersion", ignoreVersion, "Ignore the ARTIC primer scheme version");
validatorCmd->add_option("-o,--outputPrimerSeqs", outFileName, "If provided, will write primer sequences as multiFASTA (requires --refSeq to be provided)");
validatorCmd->add_option("-r,--refSeq", refSeq, "The reference sequence for the primer scheme (FASTA format)");

Expand All @@ -71,29 +76,30 @@ int main(int argc, char** argv)
vcfFilterCmd->add_option("scheme", primerSchemeFile, "The primer scheme to use")->required()->check(CLI::ExistingFile);
vcfFilterCmd->add_option("vcf", vcfIn, "The input VCF file to filter")->required()->check(CLI::ExistingFile);
vcfFilterCmd->add_option("--schemeVersion", primerSchemeVersion, "The ARTIC primer scheme version (default = 3)");
vcfFilterCmd->add_flag("--noSchemeVersion", ignoreVersion, "Ignore the ARTIC primer scheme version");
vcfFilterCmd->add_option("-o,--vcfOut", outFileName, "If provided, will write variants that pass checks");
vcfFilterCmd->add_flag("--dropPrimerVars", dropPrimerVars, "Will drop variants called within primer regions for the pool");
vcfFilterCmd->add_flag("--dropOverlapFails", dropOverlapFails, "Will drop variants called once within amplicon overlap regions");

// add the softmask callback
softmaskCmd->callback([&]() {
// load and check the primer scheme
auto ps = (ignoreVersion) ? artic::PrimerScheme(primerSchemeFile) : artic::PrimerScheme(primerSchemeFile, primerSchemeVersion);
auto ps = artic::PrimerScheme(primerSchemeFile, primerSchemeVersion);

// setup and run the softmasker
auto masker = artic::Softmasker(&ps, bamFile, userCmd.str(), minMAPQ, normalise, removeBadPairs, noReadGroups, primerStart, outFileName);
masker.Run(verbose);
});

// add the getter callback
getterCmd->callback([&]() {
artic::DownloadScheme(getSchemeName, getSchemeVersion, outFileName);
});

// add the validator callback
validatorCmd->callback([&]() {
auto ps = (ignoreVersion) ? artic::PrimerScheme(primerSchemeFile) : artic::PrimerScheme(primerSchemeFile, primerSchemeVersion);
auto ps = artic::PrimerScheme(primerSchemeFile, primerSchemeVersion);
std::cout << "primer scheme file:\t" << primerSchemeFile << std::endl;
if (!ignoreVersion)
std::cout << "primer scheme version:\t" << ps.GetVersion() << std::endl;
else
std::cout << "primer scheme version:\tunversioned" << std::endl;
std::cout << "primer scheme version:\t" << ps.GetVersion() << std::endl;
std::cout << "reference sequence ID:\t" << ps.GetReferenceName() << std::endl;
std::cout << "number of pools:\t" << ps.GetPrimerPools().size() << std::endl;
std::cout << "number of primers:\t" << ps.GetNumPrimers() << " (includes " << ps.GetNumAlts() << " alts)" << std::endl;
Expand Down Expand Up @@ -130,7 +136,7 @@ int main(int argc, char** argv)

// add the vcfFilter callback
vcfFilterCmd->callback([&]() {
auto ps = (ignoreVersion) ? artic::PrimerScheme(primerSchemeFile) : artic::PrimerScheme(primerSchemeFile, primerSchemeVersion);
auto ps = artic::PrimerScheme(primerSchemeFile, primerSchemeVersion);

// setup and run the softmasker
auto filter = artic::VcfChecker(&ps, vcfIn, outFileName, dropPrimerVars, dropOverlapFails);
Expand Down
25 changes: 0 additions & 25 deletions artic/primerScheme.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@ const std::string LEFT_PRIMER_TAG = "_LEFT";
const std::string RIGHT_PRIMER_TAG = "_RIGHT";
const std::string ALT_PRIMER_TAG = "_alt";
const std::string NO_POOL = "unmatched";
const unsigned int NUM_ARTIC_PRIMERS_v1_v2 = 196;
const unsigned int NUM_ARTIC_PRIMERS_v3 = 218;

// Primer constructor.
artic::Primer::Primer(unsigned int start, unsigned int end, const std::string primerID, size_t poolID)
Expand Down Expand Up @@ -103,33 +101,10 @@ const std::string artic::Primer::GetSeq(faidx_t* reference, const std::string& r
}

// PrimerScheme constructor.
artic::PrimerScheme::PrimerScheme(const std::string& inputFile)
{
_loadScheme(inputFile);
_validateScheme();
}

// PrimerScheme constructor with version checking.
artic::PrimerScheme::PrimerScheme(const std::string& inputFile, unsigned int schemeVersion)
: _version(schemeVersion)
{

// check the version provided
if (_version < 1 || _version > 3)
throw std::runtime_error("unrecognised primer scheme version - " + std::to_string(_version));

// load the scheme
_loadScheme(inputFile);

// check primer count matches scheme version
if (_version < 3)
if (_numPrimers != NUM_ARTIC_PRIMERS_v1_v2)
throw std::runtime_error("primer count does not equal the number required by the scheme version - " + std::to_string(_numPrimers) + " vs " + std::to_string(NUM_ARTIC_PRIMERS_v1_v2));
if (_version == 3)
if (_numPrimers != NUM_ARTIC_PRIMERS_v3)
throw std::runtime_error("primer count does not equal the number required by the scheme version - " + std::to_string(_numPrimers) + " vs " + std::to_string(NUM_ARTIC_PRIMERS_v3));

// check the scheme
_validateScheme();
}

Expand Down
4 changes: 3 additions & 1 deletion artic/primerScheme.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@

namespace artic
{
// DownloadScheme will download a specified primer scheme and the reference sequence.
void DownloadScheme(const std::string& schemeName, unsigned int requestedVersion, const std::string& outDir);

class Primer;
class PrimerScheme;
class Amplicon;
Expand Down Expand Up @@ -75,7 +78,6 @@ namespace artic
{
public:
// PrimerScheme constructors and destructor.
PrimerScheme(const std::string& inputFile);
PrimerScheme(const std::string& inputFile, unsigned int schemeVersion);
~PrimerScheme(void);

Expand Down
110 changes: 110 additions & 0 deletions artic/primerSchemeDownloader.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#include <boost/filesystem.hpp>
#include <iostream>
#include <map>
#include <sstream>

#include "log.hpp"
#include "primerScheme.hpp"

// ARTIC scheme downloads
// Note: it would be nice to have all these in one place and under a unified naming system...
const std::vector<std::string> ebovUrls = {"https://github.com/artic-network/primer-schemes/raw/master/ZaireEbola/V1/ZaireEbola.scheme.bed", "https://github.com/artic-network/primer-schemes/raw/master/ZaireEbola/V2/ZaireEbola.scheme.bed", "https://github.com/artic-network/primer-schemes/raw/master/ZaireEbola/V3/Ebov-10-Pan.bed"};
const std::vector<std::string> nivUrls = {"https://github.com/artic-network/primer-schemes/raw/master/Nipah/V1/NiV_6_Malaysia.bed"};
const std::vector<std::string> scovUrls = {"https://github.com/artic-network/artic-ncov2019/raw/master/primer_schemes/nCoV-2019/V1/nCoV-2019.scheme.bed", "https://github.com/artic-network/artic-ncov2019/raw/master/primer_schemes/nCoV-2019/V2/nCoV-2019.scheme.bed", "https://github.com/artic-network/artic-ncov2019/raw/master/primer_schemes/nCoV-2019/V3/nCoV-2019.scheme.bed"};

const std::vector<std::string> ebovRefUrls = {"https://github.com/artic-network/primer-schemes/raw/master/ZaireEbola/V1/ZaireEbola.reference.fasta", "https://github.com/artic-network/primer-schemes/raw/master/ZaireEbola/V2/ZaireEbola.reference.fasta", "https://github.com/artic-network/primer-schemes/raw/master/ZaireEbola/V3/Ebov-10-Pan.fasta"};
const std::vector<std::string> nivRefUrls = {"https://github.com/artic-network/primer-schemes/raw/master/Nipah/V1/NiV_6_Malaysia.fasta"};
const std::vector<std::string> scovRefUrls = {"https://github.com/artic-network/artic-ncov2019/raw/master/primer_schemes/nCoV-2019/V1/nCoV-2019.reference.fasta", "https://github.com/artic-network/artic-ncov2019/raw/master/primer_schemes/nCoV-2019/V2/nCoV-2019.reference.fasta", "https://github.com/artic-network/artic-ncov2019/raw/master/primer_schemes/nCoV-2019/V3/nCoV-2019.reference.fasta"};

enum Scheme
{
InvalidScheme,
Ebov,
Nipah,
Scov2,
};

Scheme resolveScheme(std::string query)
{
static const std::map<std::string, Scheme> schemeStrings{
{"ebola", Ebov},
{"nipah", Nipah},
{"scov2", Scov2},
};
auto itr = schemeStrings.find(query);
if (itr != schemeStrings.end())
return itr->second;
return InvalidScheme;
}

// DownloadScheme will download a specified primer scheme and the reference sequence.
// if 0 is provided as a version, or an unknown version provided, the latest scheme version will be used.
void artic::DownloadScheme(const std::string& schemeName, unsigned int requestedVersion, const std::string& outDir)
{
std::string schemeURL;
std::string refURL;
unsigned int version = requestedVersion;
switch (resolveScheme(schemeName))
{
case Ebov:
if (version == 0 || version > ebovUrls.size())
version = ebovUrls.size();
schemeURL = ebovUrls.at(version - 1);
refURL = ebovRefUrls.at(version - 1);
break;
case Nipah:
if (version == 0 || version > nivUrls.size())
version = nivUrls.size();
schemeURL = nivUrls.at(version - 1);
refURL = nivRefUrls.at(version - 1);
break;
case Scov2:
if (version == 0 || version > scovUrls.size())
version = scovUrls.size();
schemeURL = scovUrls.at(version - 1);
refURL = scovRefUrls.at(version - 1);
break;
default:
throw std::runtime_error("unknown scheme: " + schemeName);
}
artic::Log::Init("getter");
LOG_TRACE("starting scheme getter");
LOG_TRACE("\tscheme: {}", schemeName);
LOG_TRACE("\tversion: {}", requestedVersion);
if ((requestedVersion != version) && (requestedVersion != 0))
LOG_WARN("requested scheme version not found, using latest version instead (v{})", version);

// create filenames
std::stringstream of1;
std::stringstream of2;
if (outDir.size() != 0)
{
if (!boost::filesystem::is_directory(outDir) || !boost::filesystem::exists(outDir))
boost::filesystem::create_directory(outDir);
of1 << outDir << "/";
of2 << outDir << "/";
}
of1 << schemeName << ".v" << version << ".scheme.bed";
of2 << schemeName << ".v" << version << ".reference.fasta";

// download using wget sys call
LOG_TRACE("downloading data")
std::stringstream baseCmd;
baseCmd << "wget -q -O ";
baseCmd << schemeName << ".v" << version;
std::string cmd1 = "wget -q -O " + of1.str() + " " + schemeURL + " 2>/dev/null";
std::string cmd2 = "wget -q -O " + of2.str() + " " + refURL + " 2>/dev/null";
int res;
res = system(cmd1.c_str());
if (res != EXIT_SUCCESS)
throw std::runtime_error("could not download scheme with wget");
res = system(cmd2.c_str());
if (res != EXIT_SUCCESS)
throw std::runtime_error("could not download reference with wget");
LOG_TRACE("\t{}", of1.str());
LOG_TRACE("\t{}", of2.str());

// validate the scheme
LOG_TRACE("checking scheme")
auto ps = artic::PrimerScheme(of1.str(), version);
}
1 change: 0 additions & 1 deletion artic/vcfCheck.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ artic::VcfChecker::~VcfChecker(void)
// Run will perform the softmasking on the open BAM file.
void artic::VcfChecker::Run()
{
std::cout << _outfileName << std::endl;
artic::Log::Init("vcfchecker");
LOG_TRACE("starting VCF checker");
if ((_outfileName.size() != 0))
Expand Down
16 changes: 13 additions & 3 deletions docs/commands.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,16 @@ Example usage:
artic-tools align_trim -b in.bam primerscheme.bed > out.bam 2> out.log
```

## get_scheme

The `get_scheme` command can download primer schemes and sequences for several ARTIC references.

Example usage:

```
artic-tools get_scheme nipah
```

## validate_scheme

The `validate_scheme` command is used to check your primer scheme confirms to an ARTIC standard and can be used in our pipelines.
Expand Down Expand Up @@ -69,10 +79,10 @@ Example ouput:
[14:13:50] artic-tools::vcfchecker: nothing seen at position yet, holding var
[14:13:50] artic-tools::vcfchecker: variant at pos 22868: TG->T
[14:13:50] artic-tools::vcfchecker: located within an amplicon overlap region
[14:13:50] artic-tools::vcfchecker: var pos does not match with that of previously identified overlap, holding var (and dropping held var at 22862)
[14:13:50] artic-tools::vcfchecker: var pos does not match with that of previously identified overlap var, holding new var (and dropping held var at 22862)
[14:13:50] artic-tools::vcfchecker: variant at pos 22896: T->TTGG
[14:13:50] artic-tools::vcfchecker: located within an amplicon overlap region
[14:13:50] artic-tools::vcfchecker: var pos does not match with that of previously identified overlap, holding var (and dropping held var at 22867)
[14:13:50] artic-tools::vcfchecker: var pos does not match with that of previously identified overlap var, holding new var (and dropping held var at 22867)
[14:13:50] artic-tools::vcfchecker: variant at pos 22909: TA->T
[14:13:50] artic-tools::vcfchecker: variant at pos 22913: T->C
[14:13:50] artic-tools::vcfchecker: variant at pos 22916: CT->TC
Expand All @@ -85,7 +95,7 @@ Example ouput:
[14:13:50] artic-tools::vcfchecker: variant at pos 23098: AC->GT
[14:13:50] artic-tools::vcfchecker: variant at pos 23183: T->TC
[14:13:50] artic-tools::vcfchecker: located within an amplicon overlap region
[14:13:50] artic-tools::vcfchecker: var pos does not match with that of previously identified overlap, holding var (and dropping held var at 22895)
[14:13:50] artic-tools::vcfchecker: var pos does not match with that of previously identified overlap var, holding new var (and dropping held var at 22895)
[14:13:50] artic-tools::vcfchecker: variant at pos 23403: A->G
[14:13:50] artic-tools::vcfchecker: variant at pos 27752: C->T
[14:13:50] artic-tools::vcfchecker: variant at pos 28881: GGG->AAC
Expand Down
15 changes: 0 additions & 15 deletions tests/primerScheme_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,21 +35,6 @@ TEST(primerscheme, constructor)
FAIL() << "expected std::runtime_error";
}

// catch bad version
try
{
artic::PrimerScheme(inputScheme, 666);
FAIL() << "expected a no file error";
}
catch (std::runtime_error& err)
{
EXPECT_EQ(err.what(), std::string("unrecognised primer scheme version - 666"));
}
catch (...)
{
FAIL() << "expected std::runtime_error";
}

// constructor should pass
try
{
Expand Down

0 comments on commit a074397

Please sign in to comment.