Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix reconsensus #95

Merged
merged 19 commits into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions packages/pangraph/src/commands/build/build_run.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use crate::utils::random::get_random_number_generator;
use crate::{make_internal_error, make_internal_report};
use eyre::{Report, WrapErr};
use itertools::Itertools;
use log::info;

pub fn build_run(args: &PangraphBuildArgs) -> Result<(), Report> {
let PangraphBuildArgs { input_fastas, seed, .. } = &args;
Expand Down Expand Up @@ -66,7 +67,21 @@ pub fn build(fastas: Vec<FastaRecord>, args: &PangraphBuildArgs) -> Result<Pangr
// Case: internal node with two children. Action: produce graph for this node based on the graphs of its children.
// Assumption: Child nodes are assumed to be already visited at this point.
if let (Some(left), Some(right)) = (&left.read().data, &right.read().data) {
info!(
"=== Graph merging start: clades sizes {} + {}",
left.paths.len(),
right.paths.len()
);

clade.data = Some(merge_graphs(left, right, args).wrap_err("When merging graphs")?);

info!(
"=== Graph merging completed: clades sizes {} + {} -> {}",
left.paths.len(),
right.paths.len(),
clade.data.as_ref().unwrap().paths.len()
);

Ok(())
} else {
make_internal_error!("Found internal clade with two children, of which one or both have no graph attached.")
Expand Down
91 changes: 91 additions & 0 deletions packages/pangraph/src/pangraph/edits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use std::ops::Range;

#[cfg(any(test, debug_assertions))]
use eyre::eyre;

#[must_use]
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, JsonSchema)]
pub struct Sub {
Expand Down Expand Up @@ -215,6 +218,94 @@ impl Edit {
let qry = String::from_iter(&qry);
Ok(qry)
}

#[cfg(any(test, debug_assertions))]
pub fn sanity_check(&self, len: usize) -> Result<(), Report> {
let block_interval = Interval::new(0, len);
// === substitution checks ===
for sub in &self.subs {
if !block_interval.contains(sub.pos) {
return Err(eyre!(
"Substitution position {} is out of bounds for sequence of length {}",
sub.pos,
len
));
}
if sub.alt == '-' {
return Err(eyre!("Substitution with deletion character '-' is not allowed"));
}
}

// check that there are no two substitutions at the same position
for i in 0..self.subs.len() {
for j in i + 1..self.subs.len() {
if self.subs[i].pos == self.subs[j].pos {
return Err(eyre!(
"Substitution {:?} overlaps with substitution {:?}",
self.subs[i],
self.subs[j]
));
}
}
}

// check that substitutions do not overlap with deletions
for sub in &self.subs {
for del in &self.dels {
if del.contains(sub.pos) {
return Err(eyre!("Substitution {:?} overlaps with deletion {:?}", sub, del));
}
}
}

// === deletion checks ===
for del in &self.dels {
if del.len == 0 {
return Err(eyre!("Deletion {:?} has length 0", del));
}

if !block_interval.contains(del.pos) {
return Err(eyre!(
"Deletion {:?} is out of bounds for sequence of length {}",
del,
len
));
}

if del.end() > len {
return Err(eyre!(
"Deletion {:?} is out of bounds for sequence of length {}",
del,
len
));
}
}

// check that deletions are non-overlapping
// for all pairs of deletions
for i in 0..self.dels.len() {
for j in i + 1..self.dels.len() {
let del_i = &self.dels[i];
let del_j = &self.dels[j];
if del_i.interval().has_overlap_with(&del_j.interval()) {
return Err(eyre!("Deletion {:?} overlaps with deletion {:?}", del_i, del_j));
}
}
}

// === insertion checks ===
for ins in &self.inss {
if ins.pos > len {
return Err(eyre!(
"Insertion position {} is out of bounds for sequence of length {}",
ins.pos,
len
));
}
}

Ok(())
}
}

#[cfg(test)]
Expand Down
3 changes: 3 additions & 0 deletions packages/pangraph/src/pangraph/graph_merging.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ pub fn merge_graphs(
debug!("Removing transitive edges");
remove_transitive_edges(&mut graph).wrap_err("When removing transitive edges")?;

#[cfg(any(test, debug_assertions))]
graph.sanity_check().wrap_err("After merging graphs")?;

Ok(graph)
}

Expand Down
62 changes: 62 additions & 0 deletions packages/pangraph/src/pangraph/pangraph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,68 @@ impl Pangraph {
}
}
}

#[cfg(any(test, debug_assertions))]
pub fn sanity_check(&self) -> Result<(), Report> {
for (node_id, node) in &self.nodes {
let block = &self.blocks[&node.block_id()];
let path = &self.paths[&node.path_id()];

if !block.alignments().contains_key(node_id) {
return Err(eyre::eyre!("Node {} not found in block {}", node_id, block.id()));
}

if !path.nodes.contains(node_id) {
return Err(eyre::eyre!("Node {} not found in path {}", node_id, path.id()));
}
}

for (block_id, block) in &self.blocks {
for (node_id, _) in block.alignments() {
if !self.nodes.contains_key(node_id) {
return Err(eyre::eyre!("Node {} not found in graph", node_id));
}
}
}

for (path_id, path) in &self.paths {
for node_id in &path.nodes {
if !self.nodes.contains_key(node_id) {
return Err(eyre::eyre!("Node {} not found in graph", node_id));
}
}

// check that nodes in the same path have contiguous positions
let mut prev_pos = self.nodes[&path.nodes.first().unwrap()].position().1;
for &node_id in &path.nodes[1..] {
let pos = self.nodes[&node_id].position().0;
if pos != prev_pos {
return Err(eyre::eyre!(
"Node {} in path {} has position {} but previous node has position {}",
node_id,
path_id,
pos,
prev_pos
));
}
prev_pos = self.nodes[&node_id].position().1;
}
if path.circular() {
let first_pos = self.nodes[&path.nodes.first().unwrap()].position().0;
let last_pos = self.nodes[&path.nodes.last().unwrap()].position().1;
if first_pos != last_pos {
return Err(eyre::eyre!(
"Circular path {} has first node position {} different from last node position {}",
path_id,
first_pos,
last_pos
));
}
}
}

Ok(())
}
}

impl FromStr for Pangraph {
Expand Down
12 changes: 6 additions & 6 deletions packages/pangraph/src/pangraph/pangraph_node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ impl PangraphNode {
}
}

pub fn len(&self) -> usize {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I this this function had a bug. But due to possibly having circular genomes, the length of a node cannot always be determined by only start-end positions

self.position.1.saturating_sub(self.position.1)
}

pub fn is_empty(&self) -> bool {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Therefore also this function was wrong. I thought about defining is_emtpy as checking if the start position coincides with the end position, but this is also wrong in one edge case: when the path is circular and composed of a single node, then the start/end position of the node can be the same even if the node is not empty.

self.len() == 0
// this is almost equivalent to checking if the node is empty
// except for an edge case: when a circular path contains only
// one node. In this case even if the node is not empty, the
// start and end are the same.
pub fn start_is_end(&self) -> bool {
self.position.0 == self.position.1
}
}

Expand Down
3 changes: 3 additions & 0 deletions packages/pangraph/src/pangraph/reweave.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ impl MergePromise {
map_variations(self.anchor_block.consensus(), seq)?
};

#[cfg(any(test, debug_assertions))]
edits.sanity_check(self.anchor_block.consensus().len())?;

Ok((*node_id, edits))
})
.collect::<Result<Vec<_>, Report>>()?
Expand Down
7 changes: 7 additions & 0 deletions packages/pangraph/src/pangraph/slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ pub fn block_slice(
let mut new_alignment = BTreeMap::new();

for (old_node_id, edits) in b.alignments() {
#[cfg(any(debug_assertions, test))]
edits.sanity_check(b.consensus_len()).unwrap();

let old_node = &G.nodes[old_node_id];
let old_strandedness = old_node.strand();

Expand All @@ -137,6 +140,10 @@ pub fn block_slice(
node_updates.insert(*old_node_id, new_node.clone());

let new_edits = slice_edits(i, edits, block_L);

#[cfg(any(debug_assertions, test))]
new_edits.sanity_check(new_consensus.len()).unwrap();

let ovw = new_alignment.insert(new_node.id(), new_edits);
debug_assert!(ovw.is_none()); // new node id is not already in new_alignment
}
Expand Down
35 changes: 29 additions & 6 deletions packages/pangraph/src/reconsensus/reconsensus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ fn reconsensus(block: &mut PangraphBlock) -> Result<(), Report> {
if !ins.is_empty() || !dels.is_empty() {
let consensus = block.consensus();
let consensus = apply_indels(consensus, &dels, &ins);

// debug assert: consensus is not empty
debug_assert!(!consensus.is_empty(), "Consensus is empty after indels");

update_block_consensus(block, &consensus)?;
}
Ok(())
Expand Down Expand Up @@ -87,8 +91,11 @@ fn reconsensus_mutations(block: &mut PangraphBlock) -> Result<(), Report> {
let subs_at_pos: Vec<_> = edit.subs.iter().filter(|s| s.pos == pos).cloned().collect();
match subs_at_pos.len() {
0 => {
edit.subs.push(Sub::new(pos, original));
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This fixed the bug: the substitution should be applied only if the position is not deleted.

edit.subs.sort_by_key(|s| s.pos);
// if the position is not in a deletion, append the mutation
if !edit.dels.iter().any(|d| d.contains(pos)) {
edit.subs.push(Sub::new(pos, original));
edit.subs.sort_by_key(|s| s.pos);
}
}
1 => {
let s = &subs_at_pos[0];
Expand Down Expand Up @@ -177,6 +184,22 @@ fn update_block_consensus(block: &mut PangraphBlock, consensus: impl Into<String
.map(|(&nid, edit)| Ok((nid, edit.apply(block.consensus())?)))
.collect::<Result<BTreeMap<NodeId, String>, Report>>()?;

// debug assets: all sequences are non-empty
#[cfg(any(debug_assertions, test))]
{
for (nid, seq) in &seqs {
if seq.is_empty() {
return make_error!(
"node is empty!\nblock: {}\nnode: {}\nedits: {:?}\nconsensus: {}",
block.id(),
nid,
block.alignments().get(nid),
block.consensus()
);
}
}
}

// Re-align sequences
let alignments = seqs
.into_iter()
Expand All @@ -203,14 +226,14 @@ mod tests {
// 01234567890123456789012
// node 1) .T...--..........A.....
// node 2) .T...--...C......|.....
// node 3) .T...--...C............
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

changed the test so that it hits this edge-case

// node 3) .T...--...C.....--.....
// node 4) .C.......---.....A.....
// node 5) .....|...........A.....
// L = 23, N = 5
let aln = btreemap! {
NodeId(1) => Edit::new(vec![], vec![Del::new(5, 2)], vec![Sub::new(1, 'T'), Sub::new(17, 'A')]),
NodeId(2) => Edit::new(vec![Ins::new(17, "CAAT")], vec![Del::new(5, 2)], vec![Sub::new(1, 'T'), Sub::new(10, 'C')]),
NodeId(3) => Edit::new(vec![], vec![Del::new(5, 2)], vec![Sub::new(1, 'T'), Sub::new(10, 'C')]),
NodeId(3) => Edit::new(vec![], vec![Del::new(5, 2), Del::new(16,2)], vec![Sub::new(1, 'T'), Sub::new(10, 'C')]),
NodeId(4) => Edit::new(vec![], vec![Del::new(9, 3)], vec![Sub::new(1, 'C'), Sub::new(17, 'A')]),
NodeId(5) => Edit::new(vec![], vec![Del::new(5, 2)], vec![Sub::new(17, 'A')]),
};
Expand All @@ -223,14 +246,14 @@ mod tests {
// 01234567890123456789012
// node 1) .....--................
// node 2) .....--...C......G|....
// node 3) .....--...C......G.....
// node 3) .....--...C.....--.....
// node 4) .C.......---...........
// node 5) .G...|.................
// L = 23, N = 5
let aln = btreemap! {
NodeId(1) => Edit::new(vec![], vec![Del::new(5, 2)], vec![]),
NodeId(2) => Edit::new(vec![Ins::new(17, "CAAT")], vec![Del::new(5, 2)], vec![Sub::new(10, 'C'), Sub::new(17, 'G')]),
NodeId(3) => Edit::new(vec![], vec![Del::new(5, 2)], vec![Sub::new(10, 'C'), Sub::new(17, 'G')]),
NodeId(3) => Edit::new(vec![], vec![Del::new(5, 2), Del::new(16,2)], vec![Sub::new(10, 'C')]),
NodeId(4) => Edit::new(vec![], vec![Del::new(9, 3)], vec![Sub::new(1, 'C')]),
NodeId(5) => Edit::new(vec![], vec![Del::new(5, 2)], vec![Sub::new(1, 'G')]),
};
Expand Down
33 changes: 32 additions & 1 deletion packages/pangraph/src/reconsensus/remove_nodes.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use log::info;

use crate::pangraph::pangraph::Pangraph;
use crate::pangraph::pangraph_block::BlockId;
use crate::pangraph::pangraph_node::NodeId;
Expand All @@ -21,13 +23,42 @@ fn find_empty_nodes(graph: &Pangraph, block_ids: &[BlockId]) -> Vec<NodeId> {
for &block_id in block_ids {
let block = &graph.blocks[&block_id];
let cons_len = block.consensus_len();

debug_assert!(
cons_len > 0,
"Block {} has a consensus length of 0 and should have been removed",
block_id
);

for (&node_id, edits) in block.alignments() {
// check that edits are non-overlapping and well-defined
#[cfg(any(test, debug_assertions))]
edits.sanity_check(cons_len).unwrap();

// if the node has insertions or substitutions, or the total deletion size is less than the consensus length
// then it is not empty
if !edits.inss.is_empty() || !edits.subs.is_empty() || edits.dels.is_empty() {
// check that the node is not empty
debug_assert!(
!graph.nodes[&node_id].start_is_end(),
"Node {} with edits {:?} and consensus length {} is empty and should have been removed",
node_id,
edits,
cons_len
);

continue;
}

// if the node has no insertions or substitutions and the total deletion size is equal to the consensus length
// then it is empty and should be removed
if edits.dels.iter().map(|d| d.len).sum::<usize>() == cons_len {
info!(
"empty node {} with edits {:?} and consensus length {} is empty: to be removed",
node_id, edits, cons_len
);
debug_assert!(graph.nodes[&node_id].start_is_end());
node_ids_to_delete.push(node_id);
debug_assert!(graph.nodes[&node_id].is_empty());
}
}
}
Expand Down
Loading