diff --git a/packages/pangraph/src/bin/merge_two_graphs.rs b/packages/pangraph/src/bin/merge_two_graphs.rs new file mode 100644 index 00000000..0b2793fe --- /dev/null +++ b/packages/pangraph/src/bin/merge_two_graphs.rs @@ -0,0 +1,69 @@ +use clap::{Parser, ValueHint}; +use ctor::ctor; +use eyre::{Report, WrapErr}; +use log::info; +use pangraph::commands::build::build_args::PangraphBuildArgs; +use pangraph::io::json::{json_write_file, JsonPretty}; +use pangraph::pangraph::graph_merging::merge_graphs; +use pangraph::pangraph::pangraph::Pangraph; +use pangraph::utils::global_init::global_init; +use pangraph::utils::global_init::setup_logger; +use std::path::PathBuf; + +#[ctor] +fn init() { + global_init(); + setup_logger(log::LevelFilter::Debug); +} + +#[derive(Parser, Debug)] +struct Args { + #[clap(flatten)] + pub build_args: PangraphBuildArgs, + + #[clap(long, short = 'L')] + #[clap(value_hint = ValueHint::FilePath)] + pub left_graph: Option, + + #[clap(long, short = 'R')] + #[clap(value_hint = ValueHint::FilePath)] + pub right_graph: Option, +} + +fn main() -> Result<(), Report> { + let mut args = Args::parse(); + + rayon::ThreadPoolBuilder::new() + .num_threads(num_cpus::get()) + .build_global()?; + + // Manually empty the input_fastas field in PangraphBuildArgs + args.build_args.input_fastas = vec![]; + + info!("Reading left graph from {:?}", args.left_graph); + info!("Reading right graph from {:?}", args.right_graph); + + let left = Pangraph::from_path(&args.left_graph).wrap_err("When reading left graph")?; + let right = Pangraph::from_path(&args.right_graph).wrap_err("When reading right graph")?; + + info!("left graph sanity check"); + #[cfg(debug_assertions)] + left.sanity_check()?; + + info!("right graph sanity check"); + #[cfg(debug_assertions)] + right.sanity_check()?; + + info!("Merging graphs"); + + let merged = merge_graphs(&left, &right, &args.build_args)?; + + #[cfg(debug_assertions)] + merged.sanity_check()?; + + info!("Writing merged graph to {:?}", args.build_args.output_json); + + json_write_file(&args.build_args.output_json, &merged, JsonPretty(true))?; + + Ok(()) +} diff --git a/packages/pangraph/src/circularize/circularize.rs b/packages/pangraph/src/circularize/circularize.rs index dd441918..d3b6b358 100644 --- a/packages/pangraph/src/circularize/circularize.rs +++ b/packages/pangraph/src/circularize/circularize.rs @@ -4,11 +4,16 @@ use crate::pangraph::pangraph::Pangraph; use crate::pangraph::pangraph_block::BlockId; use crate::pangraph::pangraph_path::PathId; use eyre::Report; +use log::info; use std::collections::{BTreeMap, HashMap}; /// Removes transitive edges from the graph inplace. pub fn remove_transitive_edges(graph: &mut Pangraph) -> Result<(), Report> { while let Some(edge) = find_transitive_edges(graph).first() { + info!( + "--- Merging transitive edge between blocks {:?} and {:?}", + edge.n1.bid, edge.n2.bid + ); merge_blocks(graph, *edge)?; } Ok(()) diff --git a/packages/pangraph/src/commands/build/build_run.rs b/packages/pangraph/src/commands/build/build_run.rs index c3fe4fe0..c2507d65 100644 --- a/packages/pangraph/src/commands/build/build_run.rs +++ b/packages/pangraph/src/commands/build/build_run.rs @@ -11,6 +11,7 @@ use crate::utils::random::get_random_number_generator; use crate::{make_internal_error, make_internal_report}; use eyre::{Report, WrapErr}; use itertools::Itertools; +use log::info; pub fn build_run(args: &PangraphBuildArgs) -> Result<(), Report> { let PangraphBuildArgs { input_fastas, seed, .. } = &args; @@ -66,7 +67,29 @@ pub fn build(fastas: Vec, args: &PangraphBuildArgs) -> Result {}", + left.paths.len(), + right.paths.len(), + clade.data.as_ref().unwrap().paths.len() + ); + + #[cfg(debug_assertions)] + clade + .data + .as_ref() + .unwrap() + .sanity_check() + .wrap_err("failed sanity check after merging graphs.")?; + Ok(()) } else { make_internal_error!("Found internal clade with two children, of which one or both have no graph attached.") diff --git a/packages/pangraph/src/pangraph/edits.rs b/packages/pangraph/src/pangraph/edits.rs index 1bccb0fe..5fe55521 100644 --- a/packages/pangraph/src/pangraph/edits.rs +++ b/packages/pangraph/src/pangraph/edits.rs @@ -7,6 +7,9 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use std::ops::Range; +#[cfg(any(test, debug_assertions))] +use eyre::eyre; + #[must_use] #[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash, JsonSchema)] pub struct Sub { @@ -215,6 +218,94 @@ impl Edit { let qry = String::from_iter(&qry); Ok(qry) } + + #[cfg(any(test, debug_assertions))] + pub fn sanity_check(&self, len: usize) -> Result<(), Report> { + let block_interval = Interval::new(0, len); + // === substitution checks === + for sub in &self.subs { + if !block_interval.contains(sub.pos) { + return Err(eyre!( + "Substitution position {} is out of bounds for sequence of length {}", + sub.pos, + len + )); + } + if sub.alt == '-' { + return Err(eyre!("Substitution with deletion character '-' is not allowed")); + } + } + + // check that there are no two substitutions at the same position + for i in 0..self.subs.len() { + for j in i + 1..self.subs.len() { + if self.subs[i].pos == self.subs[j].pos { + return Err(eyre!( + "Substitution {:?} overlaps with substitution {:?}", + self.subs[i], + self.subs[j] + )); + } + } + } + + // check that substitutions do not overlap with deletions + for sub in &self.subs { + for del in &self.dels { + if del.contains(sub.pos) { + return Err(eyre!("Substitution {:?} overlaps with deletion {:?}", sub, del)); + } + } + } + + // === deletion checks === + for del in &self.dels { + if del.len == 0 { + return Err(eyre!("Deletion {:?} has length 0", del)); + } + + if !block_interval.contains(del.pos) { + return Err(eyre!( + "Deletion {:?} is out of bounds for sequence of length {}", + del, + len + )); + } + + if del.end() > len { + return Err(eyre!( + "Deletion {:?} is out of bounds for sequence of length {}", + del, + len + )); + } + } + + // check that deletions are non-overlapping + // for all pairs of deletions + for i in 0..self.dels.len() { + for j in i + 1..self.dels.len() { + let del_i = &self.dels[i]; + let del_j = &self.dels[j]; + if del_i.interval().has_overlap_with(&del_j.interval()) { + return Err(eyre!("Deletion {:?} overlaps with deletion {:?}", del_i, del_j)); + } + } + } + + // === insertion checks === + for ins in &self.inss { + if ins.pos > len { + return Err(eyre!( + "Insertion position {} is out of bounds for sequence of length {}", + ins.pos, + len + )); + } + } + + Ok(()) + } } #[cfg(test)] diff --git a/packages/pangraph/src/pangraph/graph_merging.rs b/packages/pangraph/src/pangraph/graph_merging.rs index e09db2b4..1187b777 100644 --- a/packages/pangraph/src/pangraph/graph_merging.rs +++ b/packages/pangraph/src/pangraph/graph_merging.rs @@ -58,6 +58,9 @@ pub fn merge_graphs( debug!("Removing transitive edges"); remove_transitive_edges(&mut graph).wrap_err("When removing transitive edges")?; + #[cfg(any(test, debug_assertions))] + graph.sanity_check().wrap_err("After merging graphs")?; + Ok(graph) } diff --git a/packages/pangraph/src/pangraph/pangraph.rs b/packages/pangraph/src/pangraph/pangraph.rs index 67f6d25b..37cfe7bb 100644 --- a/packages/pangraph/src/pangraph/pangraph.rs +++ b/packages/pangraph/src/pangraph/pangraph.rs @@ -107,6 +107,81 @@ impl Pangraph { } } } + + #[cfg(any(test, debug_assertions))] + pub fn sanity_check(&self) -> Result<(), Report> { + for (node_id, node) in &self.nodes { + if !self.blocks.contains_key(&node.block_id()) { + return Err(eyre::eyre!("Block {} not found in graph", node.block_id())); + } + let block = &self.blocks[&node.block_id()]; + + if !self.paths.contains_key(&node.path_id()) { + return Err(eyre::eyre!("Path {} not found in graph", node.path_id())); + } + let path = &self.paths[&node.path_id()]; + + if !block.alignments().contains_key(node_id) { + return Err(eyre::eyre!("Node {} not found in block {}", node_id, block.id())); + } + + if !path.nodes.contains(node_id) { + return Err(eyre::eyre!("Node {} not found in path {}", node_id, path.id())); + } + } + + for (block_id, block) in &self.blocks { + if block.alignments().is_empty() { + return Err(eyre::eyre!("Block {} has no nodes", block_id)); + } + + for node_id in block.alignments().keys() { + if !self.nodes.contains_key(node_id) { + return Err(eyre::eyre!("Node {} not found in graph", node_id)); + } + } + } + + for (path_id, path) in &self.paths { + for node_id in &path.nodes { + if !self.nodes.contains_key(node_id) { + return Err(eyre::eyre!("Node {node_id} from path {path_id} not found in graph")); + } + } + + // // check that there are no duplicated node ids + // // currently disabled because this could rarely happen for empty nodes + // let mut seen = BTreeSet::new(); + // for node_id in &path.nodes { + // if !seen.insert(node_id) { + // return Err(eyre::eyre!("Node {node_id} appears more than once in path {path_id}",)); + // } + // } + + // check that nodes in the same path have contiguous positions + let mut prev_pos = self.nodes[path.nodes.first().unwrap()].position().1; + for &node_id in &path.nodes[1..] { + let pos = self.nodes[&node_id].position().0; + if pos != prev_pos { + return Err(eyre::eyre!( + "Node {node_id} in path {path_id} has position {pos} but previous node has position {prev_pos}", + )); + } + prev_pos = self.nodes[&node_id].position().1; + } + if path.circular() { + let first_pos = self.nodes[path.nodes.first().unwrap()].position().0; + let last_pos = self.nodes[path.nodes.last().unwrap()].position().1; + if first_pos != last_pos { + return Err(eyre::eyre!( + "Circular path {path_id} has first node position {first_pos} different from last node position {last_pos}", + )); + } + } + } + + Ok(()) + } } impl FromStr for Pangraph { diff --git a/packages/pangraph/src/pangraph/pangraph_node.rs b/packages/pangraph/src/pangraph/pangraph_node.rs index b895b7e0..a6357c5a 100644 --- a/packages/pangraph/src/pangraph/pangraph_node.rs +++ b/packages/pangraph/src/pangraph/pangraph_node.rs @@ -41,12 +41,12 @@ impl PangraphNode { } } - pub fn len(&self) -> usize { - self.position.1.saturating_sub(self.position.1) - } - - pub fn is_empty(&self) -> bool { - self.len() == 0 + // this is almost equivalent to checking if the node is empty + // except for an edge case: when a circular path contains only + // one node. In this case even if the node is not empty, the + // start and end are the same. + pub fn start_is_end(&self) -> bool { + self.position.0 == self.position.1 } } diff --git a/packages/pangraph/src/pangraph/reweave.rs b/packages/pangraph/src/pangraph/reweave.rs index 68ee56fd..96a1e4bc 100644 --- a/packages/pangraph/src/pangraph/reweave.rs +++ b/packages/pangraph/src/pangraph/reweave.rs @@ -49,6 +49,9 @@ impl MergePromise { map_variations(self.anchor_block.consensus(), seq)? }; + #[cfg(any(test, debug_assertions))] + edits.sanity_check(self.anchor_block.consensus().len())?; + Ok((*node_id, edits)) }) .collect::, Report>>()? diff --git a/packages/pangraph/src/pangraph/slice.rs b/packages/pangraph/src/pangraph/slice.rs index f8dc14b1..6c4a7c11 100644 --- a/packages/pangraph/src/pangraph/slice.rs +++ b/packages/pangraph/src/pangraph/slice.rs @@ -116,6 +116,9 @@ pub fn block_slice( let mut new_alignment = BTreeMap::new(); for (old_node_id, edits) in b.alignments() { + #[cfg(any(debug_assertions, test))] + edits.sanity_check(b.consensus_len()).unwrap(); + let old_node = &G.nodes[old_node_id]; let old_strandedness = old_node.strand(); @@ -137,6 +140,10 @@ pub fn block_slice( node_updates.insert(*old_node_id, new_node.clone()); let new_edits = slice_edits(i, edits, block_L); + + #[cfg(any(debug_assertions, test))] + new_edits.sanity_check(new_consensus.len()).unwrap(); + let ovw = new_alignment.insert(new_node.id(), new_edits); debug_assert!(ovw.is_none()); // new node id is not already in new_alignment } diff --git a/packages/pangraph/src/reconsensus/reconsensus.rs b/packages/pangraph/src/reconsensus/reconsensus.rs index 5c47eb02..83d1f6e1 100644 --- a/packages/pangraph/src/reconsensus/reconsensus.rs +++ b/packages/pangraph/src/reconsensus/reconsensus.rs @@ -45,6 +45,10 @@ fn reconsensus(block: &mut PangraphBlock) -> Result<(), Report> { if !ins.is_empty() || !dels.is_empty() { let consensus = block.consensus(); let consensus = apply_indels(consensus, &dels, &ins); + + // debug assert: consensus is not empty + debug_assert!(!consensus.is_empty(), "Consensus is empty after indels"); + update_block_consensus(block, &consensus)?; } Ok(()) @@ -87,8 +91,11 @@ fn reconsensus_mutations(block: &mut PangraphBlock) -> Result<(), Report> { let subs_at_pos: Vec<_> = edit.subs.iter().filter(|s| s.pos == pos).cloned().collect(); match subs_at_pos.len() { 0 => { - edit.subs.push(Sub::new(pos, original)); - edit.subs.sort_by_key(|s| s.pos); + // if the position is not in a deletion, append the mutation + if !edit.dels.iter().any(|d| d.contains(pos)) { + edit.subs.push(Sub::new(pos, original)); + edit.subs.sort_by_key(|s| s.pos); + } } 1 => { let s = &subs_at_pos[0]; @@ -177,6 +184,22 @@ fn update_block_consensus(block: &mut PangraphBlock, consensus: impl Into, Report>>()?; + // debug assets: all sequences are non-empty + #[cfg(any(debug_assertions, test))] + { + for (nid, seq) in &seqs { + if seq.is_empty() { + return make_error!( + "node is empty!\nblock: {}\nnode: {}\nedits: {:?}\nconsensus: {}", + block.id(), + nid, + block.alignments().get(nid), + block.consensus() + ); + } + } + } + // Re-align sequences let alignments = seqs .into_iter() @@ -203,14 +226,14 @@ mod tests { // 01234567890123456789012 // node 1) .T...--..........A..... // node 2) .T...--...C......|..... - // node 3) .T...--...C............ + // node 3) .T...--...C.....--..... // node 4) .C.......---.....A..... // node 5) .....|...........A..... // L = 23, N = 5 let aln = btreemap! { NodeId(1) => Edit::new(vec![], vec![Del::new(5, 2)], vec![Sub::new(1, 'T'), Sub::new(17, 'A')]), NodeId(2) => Edit::new(vec![Ins::new(17, "CAAT")], vec![Del::new(5, 2)], vec![Sub::new(1, 'T'), Sub::new(10, 'C')]), - NodeId(3) => Edit::new(vec![], vec![Del::new(5, 2)], vec![Sub::new(1, 'T'), Sub::new(10, 'C')]), + NodeId(3) => Edit::new(vec![], vec![Del::new(5, 2), Del::new(16,2)], vec![Sub::new(1, 'T'), Sub::new(10, 'C')]), NodeId(4) => Edit::new(vec![], vec![Del::new(9, 3)], vec![Sub::new(1, 'C'), Sub::new(17, 'A')]), NodeId(5) => Edit::new(vec![], vec![Del::new(5, 2)], vec![Sub::new(17, 'A')]), }; @@ -223,14 +246,14 @@ mod tests { // 01234567890123456789012 // node 1) .....--................ // node 2) .....--...C......G|.... - // node 3) .....--...C......G..... + // node 3) .....--...C.....--..... // node 4) .C.......---........... // node 5) .G...|................. // L = 23, N = 5 let aln = btreemap! { NodeId(1) => Edit::new(vec![], vec![Del::new(5, 2)], vec![]), NodeId(2) => Edit::new(vec![Ins::new(17, "CAAT")], vec![Del::new(5, 2)], vec![Sub::new(10, 'C'), Sub::new(17, 'G')]), - NodeId(3) => Edit::new(vec![], vec![Del::new(5, 2)], vec![Sub::new(10, 'C'), Sub::new(17, 'G')]), + NodeId(3) => Edit::new(vec![], vec![Del::new(5, 2), Del::new(16,2)], vec![Sub::new(10, 'C')]), NodeId(4) => Edit::new(vec![], vec![Del::new(9, 3)], vec![Sub::new(1, 'C')]), NodeId(5) => Edit::new(vec![], vec![Del::new(5, 2)], vec![Sub::new(1, 'G')]), }; diff --git a/packages/pangraph/src/reconsensus/remove_nodes.rs b/packages/pangraph/src/reconsensus/remove_nodes.rs index f305a331..13a7825e 100644 --- a/packages/pangraph/src/reconsensus/remove_nodes.rs +++ b/packages/pangraph/src/reconsensus/remove_nodes.rs @@ -1,3 +1,5 @@ +use log::info; + use crate::pangraph::pangraph::Pangraph; use crate::pangraph::pangraph_block::BlockId; use crate::pangraph::pangraph_node::NodeId; @@ -21,13 +23,38 @@ fn find_empty_nodes(graph: &Pangraph, block_ids: &[BlockId]) -> Vec { for &block_id in block_ids { let block = &graph.blocks[&block_id]; let cons_len = block.consensus_len(); + + debug_assert!( + cons_len > 0, + "Block {block_id} has a consensus length of 0 and should have been removed", + ); + for (&node_id, edits) in block.alignments() { + // check that edits are non-overlapping and well-defined + #[cfg(any(test, debug_assertions))] + edits.sanity_check(cons_len).unwrap(); + + // if the node has insertions or substitutions, or the total deletion size is less than the consensus length + // then it is not empty if !edits.inss.is_empty() || !edits.subs.is_empty() || edits.dels.is_empty() { + // check that the node is not empty + debug_assert!( + !graph.nodes[&node_id].start_is_end(), + "Node {node_id} with edits {edits:?} and consensus length {cons_len} is empty and should have been removed", + ); + continue; } + + // if the node has no insertions or substitutions and the total deletion size is equal to the consensus length + // then it is empty and should be removed if edits.dels.iter().map(|d| d.len).sum::() == cons_len { + info!( + "empty node {} with edits {:?} and consensus length {} is empty: to be removed", + node_id, edits, cons_len + ); + debug_assert!(graph.nodes[&node_id].start_is_end()); node_ids_to_delete.push(node_id); - debug_assert!(graph.nodes[&node_id].is_empty()); } } } @@ -47,10 +74,19 @@ fn remove_nodes_from_graph(graph: &mut Pangraph, node_ids: &[NodeId]) { // remove from node dictionary graph.nodes.remove(&node_id); - // remove from path + // === old implementation. Assumes only one node to remove per path === + // === but there currently is an edge case where multiple empty nodes with the same id could be present === + // // remove from path + // let path_nodes = &mut graph.paths.get_mut(&path_id).unwrap().nodes; + // let node_idx = path_nodes.iter().position(|&n| n == node_id).unwrap(); + // path_nodes.remove(node_idx); + + // remove all nodes with the same id let path_nodes = &mut graph.paths.get_mut(&path_id).unwrap().nodes; - let node_idx = path_nodes.iter().position(|&n| n == node_id).unwrap(); - path_nodes.remove(node_idx); + // while there is a node with this id, remove it: + while let Some(node_idx) = path_nodes.iter().position(|&n| n == node_id) { + path_nodes.remove(node_idx); + } // remove from block alignment let _removed = graph.blocks.get_mut(&block_id).unwrap().alignment_remove(node_id);