Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change to PanSN "scaffolds" with colon and dash coordinate ranges #97

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion src/copy_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ void copy_path(const PathHandleGraph* from, const path_handle_t& from_path,
from->get_sample_name(from_path),
from->get_locus_name(from_path),
from->get_haplotype(from_path),
from->get_phase_block(from_path),
from->get_subrange(from_path),
from->get_is_circular(from_path));

Expand Down
4 changes: 1 addition & 3 deletions src/include/handlegraph/mutable_path_metadata.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@ class MutablePathMetadata : virtual public PathMetadata {

/**
* Add a path with the given metadata. Any item can be the corresponding
* unset sentinel (PathMetadata::NO_LOCUS_NAME,
* PathMetadata::NO_PHASE_BLOCK, etc.).
* unset sentinel (PathMetadata::NO_LOCUS_NAME, etc.).
*
* Implementations may refuse to store paths-or-threads of certain senses
* when relevant fields are unset.
Expand All @@ -43,7 +42,6 @@ class MutablePathMetadata : virtual public PathMetadata {
const std::string& sample,
const std::string& locus,
const size_t& haplotype,
const size_t& phase_block,
const subrange_t& subrange,
bool is_circular = false);

Expand Down
6 changes: 4 additions & 2 deletions src/include/handlegraph/path_handle_graph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ class PathHandleGraph : virtual public HandleGraph, virtual public PathMetadata
/// visible here. Only reference or generic named paths should be visible.
template<typename Iteratee>
bool for_each_step_on_handle(const handle_t& handle, const Iteratee& iteratee) const;

////////////////////////////////////////////////////////////////////////////
// Backing protected virtual methods that need to be implemented
////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -154,6 +154,9 @@ class PathHandleGraph : virtual public HandleGraph, virtual public PathMetadata
/// Returns true if the given path is empty, and false otherwise
virtual bool is_empty(const path_handle_t& path_handle) const;

/// Measure the length of a path.
virtual size_t get_path_length(const path_handle_t& path_handle) const;

////////////////////////////////////////////////////////////////////////////
// Concrete utility methods
////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -186,7 +189,6 @@ bool PathHandleGraph::for_each_step_on_handle(const handle_t& handle, const Iter
return for_each_step_on_handle_impl(handle, BoolReturningWrapper<Iteratee>::wrap(iteratee));
}


template<typename Iteratee>
bool PathHandleGraph::for_each_step_in_path(const path_handle_t& path, const Iteratee& iteratee) const {

Expand Down
133 changes: 97 additions & 36 deletions src/include/handlegraph/path_metadata.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@ namespace handlegraph {
*
* Our model is that paths come in different "senses":
*
* - SENSE_GENERIC: a generic named path. Has a "locus" name.
* - PathSense::GENERIC: a generic named path. Has a "locus" name.
*
* - SENSE_REFERENCE: a part of a reference assembly. Has a "sample" name, a
* "locus" name, and a haplotype number.
* - PathSense::REFERENCE: a part of a reference assembly. Has a "sample" name,
* a "locus" name, and a haplotype number.
*
* - SENSE_HAPLOTYPE: a haplotype from a particular individual. Has a "sample"
* name, a "locus" name, a haplotype number, and a phase block identifier.
* - PathSense::HAPLOTYPE: a haplotype from a particular individual. Has a
* "sample" name, a "locus" name, a haplotype number.
*
* Paths of all senses can represent subpaths, with bounds.
*
Expand All @@ -43,15 +43,11 @@ namespace handlegraph {
* represented. GFA uses a convention where the presence of a haplotype 0
* implies that only one haplotype is present.
*
* - Phase block identifier: Distinguishes fragments of a haplotype that are
* phased but not necessarily part of a single self-consistent scaffold (often
* due to self-contradictory VCF information). Must be unique within a sample,
* locus, and haplotype. May be a number or a start coordinate.
* - Subrange, for when a path as stored gives only a sub-range of a conceptually
* longer scaffold. Multiple items can be stored with identical metadata in the
* other fields if their subranges are non-overlapping. For haplotypes, the
* subrange coordinates may be synthetic.
*
* - Bounds, for when a path as stored gives only a sub-range of a conceptually
* longer path. Multiple items can be stored with identical metadata in the
* other fields if their bounds are non-overlapping.
* TODO: Interaction with phase block in GBWT???
*/
class PathMetadata {
public:
Expand Down Expand Up @@ -84,17 +80,11 @@ class PathMetadata {
virtual std::string get_locus_name(const path_handle_t& handle) const;
static const std::string NO_LOCUS_NAME;

/// Get the haplotype number (0 or 1, for diploid) of the path-or-thread,
/// Get the haplotype number (0 for haploid, 1 or 2 for diploid) of the path-or-thread,
/// or NO_HAPLOTYPE if it does not belong to one.
virtual size_t get_haplotype(const path_handle_t& handle) const;
static const size_t NO_HAPLOTYPE;

/// Get the phase block number (contiguously phased region of a sample,
/// contig, and haplotype) of the path-or-thread, or NO_PHASE_BLOCK if it
/// does not belong to one.
virtual size_t get_phase_block(const path_handle_t& handle) const;
static const size_t NO_PHASE_BLOCK;

/// Get the bounds of the path-or-thread that are actually represented
/// here. Should be NO_SUBRANGE if the entirety is represented here, and
/// 0-based inclusive start and exclusive end positions of the stored
Expand All @@ -105,13 +95,21 @@ class PathMetadata {
virtual subrange_t get_subrange(const path_handle_t& handle) const;
static const subrange_t NO_SUBRANGE;
static const offset_t NO_END_POSITION;

/// Get the name of the scaffold that the path is on. This is the path name
/// without any subrange information.
virtual std::string get_path_scaffold_name(const path_handle_t& handle) const;

/// Get the region that a path covers on its scaffold. Will compute the end
/// coordinate if not stored.
virtual region_t get_path_region(const path_handle_t& handle) const;

////////////////////////////////////////////////////////////////////////////
// Tools for converting back and forth with single-string path names
////////////////////////////////////////////////////////////////////////////

/// Extract the sense of a path from the given formatted path name, if
/// possible. If not possible, return SENSE_GENERIC.
/// possible. If not possible, return PathSense::GENERIC.
static PathSense parse_sense(const std::string& path_name);

/// Get the name of the sample or assembly embedded in the given formatted
Expand All @@ -122,37 +120,39 @@ class PathMetadata {
/// path name, or NO_LOCUS_NAME if it does not belong to one.
static std::string parse_locus_name(const std::string& path_name);

/// Get the haplotype number (0 or 1, for diploid) embedded in the given
/// Get the haplotype number (0 for haploid, 1 or 2 for diploid) embedded in the given
/// formatted path name, or NO_HAPLOTYPE if it does not belong to one.
static size_t parse_haplotype(const std::string& path_name);

/// Get the phase block number (contiguously phased region of a sample,
/// contig, and haplotype) embedded in the given formatted path name, or
/// NO_PHASE_BLOCK if it does not belong to one.
static size_t parse_phase_block(const std::string& path_name);

/// Get the bounds embedded in the given formatted path name, or
/// NO_SUBRANGE if they are absent. If no end position is stored,
/// NO_END_POSITION may be returned for the end position.
static subrange_t parse_subrange(const std::string& path_name);

/// Decompose a formatted path name into metadata.
/// Expects 1-based, end-inclusive coordinates in subranges in the name,
/// and emits 0-based, end-exclusive coordinates.
static void parse_path_name(const std::string& path_name,
PathSense& sense,
std::string& sample,
std::string& locus,
size_t& haplotype,
size_t& phase_block,
subrange_t& subrange);

/// Decompose a scaffold name (without range) into metadata (without sense)
static void parse_scaffold_name(const std::string& scaffold_name,
std::string& sample,
std::string& locus,
size_t& haplotype);

/// Compose a formatted path name for the given metadata. Any item can be
/// the corresponding unset sentinel (PathMetadata::NO_LOCUS_NAME,
/// PathMetadata::NO_PHASE_BLOCK, etc.).
/// the corresponding unset sentinel (PathMetadata::NO_LOCUS_NAME, etc.).
/// Expects 0-based, end-exclusive coordinates and produces 1-based,
/// end-inclusive coordinates in the name.
static std::string create_path_name(const PathSense& sense,
const std::string& sample,
const std::string& locus,
const size_t& haplotype,
const size_t& phase_block,
const subrange_t& subrange);

////////////////////////////////////////////////////////////////////////////
Expand All @@ -177,6 +177,16 @@ class PathMetadata {
const std::unordered_set<std::string>* samples,
const std::unordered_set<std::string>* loci,
const Iteratee& iteratee) const;

/// Loop through all the paths matching the given query. Query elements
/// which are null match everything. Returns false and stops if the
/// iteratee returns false.
template<typename Iteratee>
bool for_each_path_matching(const std::unordered_set<PathSense>* senses,
const std::unordered_set<std::string>* samples,
const std::unordered_set<std::string>* loci,
const std::unordered_set<size_t>* haplotypes,
const Iteratee& iteratee) const;

/// Loop through all the paths matching the given query. Query elements
/// which are empty match everything. Returns false and stops if the
Expand All @@ -186,6 +196,22 @@ class PathMetadata {
const std::unordered_set<std::string>& samples,
const std::unordered_set<std::string>& loci,
const Iteratee& iteratee) const;

/// Loop through all the paths matching the given query. Query elements
/// which are empty match everything. Returns false and stops if the
/// iteratee returns false.
template<typename Iteratee>
bool for_each_path_matching(const std::unordered_set<PathSense>& senses,
const std::unordered_set<std::string>& samples,
const std::unordered_set<std::string>& loci,
const std::unordered_set<size_t>& haplotypes,
const Iteratee& iteratee) const;

/// Loop through all the paths on the scaffold with the given name. Paths
/// are not necessarily visited in order.
template<typename Iteratee>
bool for_each_path_on_scaffold(const std::string& scaffold_name,
const Iteratee& iteratee) const;

/// Loop through all steps on the given handle for paths with the given
/// sense. Returns false and stops if the iteratee returns false.
Expand All @@ -210,7 +236,13 @@ class PathMetadata {
virtual bool for_each_path_matching_impl(const std::unordered_set<PathSense>* senses,
const std::unordered_set<std::string>* samples,
const std::unordered_set<std::string>* loci,
const std::unordered_set<size_t>* haplotypes,
const std::function<bool(const path_handle_t&)>& iteratee) const;

/// Loop through the handles of paths that are on the given scaffold. Paths
/// are not necessarily visited in order. Returns false and stops if the
/// iteratee returns false.
virtual bool for_each_path_on_scaffold_impl(const std::string& scaffold, const std::function<bool(const path_handle_t&)>& iteratee) const;

/// Loop through all steps on the given handle for paths with the given
/// sense. Returns false and stops if the iteratee returns false.
Expand All @@ -224,6 +256,9 @@ class PathMetadata {

/// Look up the name of a path from a handle to it
virtual std::string get_path_name(const path_handle_t& path_handle) const = 0;

/// Measure the length of a path.
virtual size_t get_path_length(const path_handle_t& path_handle) const = 0;

/// Returns a handle to the path that a step is on
virtual path_handle_t get_path_handle_of_step(const step_handle_t& step_handle) const = 0;
Expand Down Expand Up @@ -253,11 +288,11 @@ class PathMetadata {
////////////////////////////////////////////////////////////////////////////

static const std::regex FORMAT;
static const std::regex SCAFFOLD_FORMAT;
static const size_t ASSEMBLY_OR_NAME_MATCH;
static const size_t LOCUS_MATCH_NUMERICAL_WITHOUT_HAPLOTYPE;
static const size_t HAPLOTYPE_MATCH;
static const size_t LOCUS_MATCH_ANY;
static const size_t PHASE_BLOCK_MATCH;
static const size_t RANGE_START_MATCH;
static const size_t RANGE_END_MATCH;

Expand All @@ -266,7 +301,6 @@ class PathMetadata {
// Ranges are set off with some additional characters.
static const char RANGE_START_SEPARATOR;
static const char RANGE_END_SEPARATOR;
static const char RANGE_TERMINATOR;
};

////////////////////////////////////////////////////////////////////////////
Expand All @@ -276,34 +310,61 @@ class PathMetadata {
/// Visit the paths that have the given sense. Builds a one-element set
/// holding `sense` and delegates to for_each_path_matching_impl with the
/// sample and locus filters null, adapting the iteratee through
/// BoolReturningWrapper. Returns whatever the backing implementation returns.
// NOTE(review): two consecutive return statements follow. The first passes
// four arguments and the second passes five (one more nullptr, presumably
// for the haplotype filter). This looks like a rendered diff showing a
// removed line next to its replacement -- confirm which one is live; as
// written only the first can ever execute.
template<typename Iteratee>
bool PathMetadata::for_each_path_of_sense(const PathSense& sense, const Iteratee& iteratee) const {
std::unordered_set<PathSense> senses{sense};
return for_each_path_matching_impl(&senses, nullptr, nullptr, BoolReturningWrapper<Iteratee>::wrap(iteratee));
return for_each_path_matching_impl(&senses, nullptr, nullptr, nullptr, BoolReturningWrapper<Iteratee>::wrap(iteratee));
}

/// Visit the paths that belong to the given sample. Builds a one-element set
/// holding `sample` and delegates to for_each_path_matching_impl with the
/// sense and locus filters null, adapting the iteratee through
/// BoolReturningWrapper. Returns whatever the backing implementation returns.
// NOTE(review): two consecutive return statements follow. The first passes
// four arguments and the second passes five (one more nullptr, presumably
// for the haplotype filter). This looks like a rendered diff showing a
// removed line next to its replacement -- confirm which one is live; as
// written only the first can ever execute.
template<typename Iteratee>
bool PathMetadata::for_each_path_of_sample(const std::string& sample, const Iteratee& iteratee) const {
std::unordered_set<std::string> samples{sample};
return for_each_path_matching_impl(nullptr, &samples, nullptr, BoolReturningWrapper<Iteratee>::wrap(iteratee));
return for_each_path_matching_impl(nullptr, &samples, nullptr, nullptr, BoolReturningWrapper<Iteratee>::wrap(iteratee));
}

/// Pointer-based path query without a haplotype filter. The sense, sample
/// and locus filters are forwarded unchanged to the backing implementation,
/// and a null haplotype filter is supplied in the remaining slot.
template<typename Iteratee>
bool PathMetadata::for_each_path_matching(const std::unordered_set<PathSense>* senses,
                                          const std::unordered_set<std::string>* samples,
                                          const std::unordered_set<std::string>* loci,
                                          const Iteratee& iteratee) const {
    // This overload takes no haplotype filter, so hand the implementation a null one.
    const std::unordered_set<size_t>* no_haplotype_filter = nullptr;
    // Adapt the caller's iteratee before passing it to the backing implementation.
    auto&& adapted_iteratee = BoolReturningWrapper<Iteratee>::wrap(iteratee);
    return for_each_path_matching_impl(senses, samples, loci, no_haplotype_filter, adapted_iteratee);
}

/// Pointer-based path query including a haplotype filter. All four filter
/// pointers are forwarded unchanged to the backing implementation, together
/// with the iteratee adapted through BoolReturningWrapper.
template<typename Iteratee>
bool PathMetadata::for_each_path_matching(const std::unordered_set<PathSense>* senses,
                                          const std::unordered_set<std::string>* samples,
                                          const std::unordered_set<std::string>* loci,
                                          const std::unordered_set<size_t>* haplotypes,
                                          const Iteratee& iteratee) const {
    // Adapt the caller's iteratee before passing it to the backing implementation.
    auto&& adapted_iteratee = BoolReturningWrapper<Iteratee>::wrap(iteratee);
    return for_each_path_matching_impl(senses, samples, loci, haplotypes, adapted_iteratee);
}

/// Reference-based path query without a haplotype filter.
// NOTE(review): two consecutive return statements follow. The first hands the
// three sets and the wrapped iteratee straight to for_each_path_matching_impl;
// the second turns each empty set into a null pointer and calls the
// pointer-taking for_each_path_matching overload. This looks like a rendered
// diff showing a removed line next to its replacement -- confirm which one is
// live; as written only the first can ever execute.
template<typename Iteratee>
bool PathMetadata::for_each_path_matching(const std::unordered_set<PathSense>& senses,
const std::unordered_set<std::string>& samples,
const std::unordered_set<std::string>& loci,
const Iteratee& iteratee) const {
return for_each_path_matching_impl(senses, samples, loci, BoolReturningWrapper<Iteratee>::wrap(iteratee));
// An empty set is passed on as a null pointer; a non-empty set by address.
return for_each_path_matching(senses.empty() ? nullptr : &senses,
samples.empty() ? nullptr : &samples,
loci.empty() ? nullptr : &loci,
iteratee);
}

/// Reference-based path query including a haplotype filter. Each empty set is
/// translated to a null pointer and each non-empty set to its address, then
/// the pointer-taking overload is called with the iteratee passed through
/// untouched.
template<typename Iteratee>
bool PathMetadata::for_each_path_matching(const std::unordered_set<PathSense>& senses,
                                          const std::unordered_set<std::string>& samples,
                                          const std::unordered_set<std::string>& loci,
                                          const std::unordered_set<size_t>& haplotypes,
                                          const Iteratee& iteratee) const {
    // Empty set -> null pointer; non-empty set -> pointer to the caller's set.
    const std::unordered_set<PathSense>* sense_filter = senses.empty() ? nullptr : &senses;
    const std::unordered_set<std::string>* sample_filter = samples.empty() ? nullptr : &samples;
    const std::unordered_set<std::string>* locus_filter = loci.empty() ? nullptr : &loci;
    const std::unordered_set<size_t>* haplotype_filter = haplotypes.empty() ? nullptr : &haplotypes;

    return for_each_path_matching(sense_filter, sample_filter, locus_filter, haplotype_filter, iteratee);
}

/// Visit the paths on the scaffold with the given name by delegating to
/// for_each_path_on_scaffold_impl, with the iteratee adapted through
/// BoolReturningWrapper. Returns whatever the backing implementation returns.
template<typename Iteratee>
bool PathMetadata::for_each_path_on_scaffold(const std::string& scaffold_name, const Iteratee& iteratee) const {
    // Adapt the caller's iteratee before passing it to the backing implementation.
    auto&& adapted_iteratee = BoolReturningWrapper<Iteratee>::wrap(iteratee);
    return for_each_path_on_scaffold_impl(scaffold_name, adapted_iteratee);
}

template<typename Iteratee>
bool PathMetadata::for_each_step_of_sense(const handle_t& visited, const PathSense& sense, const Iteratee& iteratee) const {
return for_each_step_of_sense_impl(visited, sense, BoolReturningWrapper<Iteratee>::wrap(iteratee));
Expand Down
23 changes: 22 additions & 1 deletion src/include/handlegraph/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#include <utility>
#include <functional>
#include <limits>
#include <ostream>
#include <string>

namespace handlegraph {

Expand All @@ -29,9 +31,28 @@ typedef std::size_t offset_t;
[[deprecated("off_t collides with a POSIX type, use offset_t instead")]]
typedef offset_t off_t;

/// Represents a range of offsets, 0-based, end-exclusive
/// Represents a range of offsets, 0-based, end-exclusive.
/// The end may be PathMetadata::NO_END_POSITION.
typedef std::pair<offset_t, offset_t> subrange_t;

/// Represents a position or range on a named scaffold. May partially cover
/// zero or more paths with subranges in a graph. Its subrange must always have
/// a start and an end set.
typedef std::pair<std::string, subrange_t> region_t;

/// Parse a region_t from user-facing one-based end-inclusive coordinates.
/// Raises std::invalid_argument if the provided string is not understandable
/// as a region. The region must include an end coordinate.
region_t parse_region(const std::string& region_text);

/// Turn a region_t into a user-facing one-based end-inclusive coordinate
/// string. The region must include an end coordinate.
std::string to_string(const region_t& region);

/// Write a region_t to a stream as a user-facing one-based end-inclusive
/// coordinate string. The region must include an end coordinate.
std::ostream& operator<<(std::ostream& out, const region_t region);

/// Represents a position
typedef std::tuple<nid_t, bool, offset_t> pos_t;

Expand Down
Loading