diff --git a/Cargo.lock b/Cargo.lock index abddb3b9..23cd04ab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -122,6 +122,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "arrayvec" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" + [[package]] name = "atty" version = "0.2.14" @@ -274,6 +280,15 @@ version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +[[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "memchr", +] + [[package]] name = "bumpalo" version = "3.16.0" @@ -301,6 +316,20 @@ name = "bytemuck" version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94bbb0ad554ad961ddc5da507a12a29b14e4ae5bda06b19f575a3e6079d2e2ae" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bcfcc3cd946cb52f0bbfdbbcfa2f4e24f75ebb6c0e1002f7c25904fada18b9ec" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.77", +] [[package]] name = "byteorder" @@ -361,7 +390,7 @@ version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" dependencies = [ - "nom", + "nom 7.1.3", ] [[package]] @@ -1006,6 +1035,24 @@ dependencies = [ "syn 2.0.77", ] +[[package]] +name = "gfa" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9632601a032d2ae43f5050b454dd27c2add69b2e52bd46a4888f3aeb6e3629f5" +dependencies = [ + "anyhow", + "bstr", + "bytemuck", + "fnv", + "lazy_static", + "memmap", + "nom 5.1.3", + "regex", + "serde", + "serde_json", +] + [[package]] name = "gimli" version = "0.28.1" @@ -1111,6 +1158,12 @@ dependencies = [ "hashbrown 0.14.5", ] +[[package]] +name = "indoc" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" + [[package]] name = "intervallum" version = "1.4.1" @@ -1189,6 +1242,19 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "lexical-core" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe" +dependencies = [ + "arrayvec", + "bitflags 1.3.2", + "cfg-if", + "ryu", + "static_assertions", +] + [[package]] name = "lexical-core" version = "0.8.5" @@ -1353,6 +1419,16 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "memmap" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "memoize" version = "0.4.2" @@ -1573,6 +1649,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "nom" +version = "5.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08959a387a676302eebf4ddbcbc611da04285579f76f88ee0506c63b1a61dd4b" +dependencies = [ + "lexical-core 0.7.6", + "memchr", + "version_check", +] + [[package]] name = "nom" version = "7.1.3" @@ -1632,7 +1719,7 @@ checksum = "2b94966806ac7aec118d41eea7080bfbd0e8b843ba64f46522c57f0f55cfb1f0" dependencies = [ "bitflags 2.6.0", "indexmap", - "lexical-core", + "lexical-core 0.8.5", "memchr", "noodles-bgzf", "noodles-core", @@ -1790,6 +1877,8 @@ dependencies = [ "flate2", "gcollections", "getset", + "gfa", + "indoc", "intervallum", "itertools", "lazy_static", diff --git a/Cargo.toml b/Cargo.toml index c916c99b..ae613616 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,8 @@ eyre = "=0.6.12" flate2 = "=1.0.33" gcollections = "=1.5.0" getset = "0.1.3" +gfa = { version = "=0.10.1", features = ["serde", "serde_json"] } +indoc = "=2.0.5" intervallum = "=1.4.1" itertools = "=0.13.0" lazy_static = "=1.5.0" diff --git a/packages/pangraph/Cargo.toml b/packages/pangraph/Cargo.toml index c8c43050..78668403 100644 --- a/packages/pangraph/Cargo.toml +++ b/packages/pangraph/Cargo.toml @@ -37,6 +37,8 @@ eyre = { workspace = true } flate2 = { workspace = true } gcollections = { workspace = true } getset = { workspace = true } +gfa = { workspace = true } +indoc = { workspace = true } intervallum = { workspace = true } itertools = { workspace = true } lazy_static = { workspace = true } diff --git a/packages/pangraph/src/commands/export/export_args.rs b/packages/pangraph/src/commands/export/export_args.rs index a427b8df..3971082b 100644 --- a/packages/pangraph/src/commands/export/export_args.rs +++ b/packages/pangraph/src/commands/export/export_args.rs @@ -56,29 +56,17 @@ pub struct PangraphExportArgs { #[clap(value_hint = ValueHint::Other)] pub maximum_depth: usize, - /// Path to directory where output will be stored - #[clap(long, short = 'o', default_value = "export")] - #[clap(value_hint = ValueHint::DirPath)] - pub output_directory: PathBuf, - - /// Basename of files - #[clap(long, short = 'p', default_value = "pangraph")] - #[clap(value_hint = ValueHint::DirPath)] - pub prefix: String, - - /// Do not emit GFA file - #[clap(long, alias = "ng")] - pub no_export_gfa: bool, - - /// Emit vis directory to input to panX-visualization - #[clap(long, alias = "px")] - pub export_panx: bool, - /// Do not export any block that contains at least one strain more than once #[clap(long, alias = "nd")] pub no_duplications: bool, - /// Random seed - #[clap(long)] - pub seed: Option, + /// Path to output GFA file + #[clap(long, alias = "gfa")] + #[clap(value_hint = ValueHint::Other)] + pub output_gfa: Option, + + /// Path to output directory where PanX visualization files will be written + #[clap(long, alias = "panx")] + #[clap(value_hint = ValueHint::DirPath)] + pub output_panx: Option, } diff --git a/packages/pangraph/src/commands/export/export_run.rs b/packages/pangraph/src/commands/export/export_run.rs index 95857d25..89333698 100644 --- a/packages/pangraph/src/commands/export/export_run.rs +++ b/packages/pangraph/src/commands/export/export_run.rs @@ -1,9 +1,10 @@ use crate::commands::export::export_args::PangraphExportArgs; +use crate::io::gfa::gfa_write_file; +use crate::make_error; use crate::pangraph::pangraph::Pangraph; -use crate::utils::random::get_random_number_generator; use eyre::Report; -pub fn export_run(args: &PangraphExportArgs) -> Result<(), Report> { +pub fn export_run(args: PangraphExportArgs) -> Result<(), Report> { let PangraphExportArgs { input_json, edge_minimum_length, @@ -14,17 +15,25 @@ pub fn export_run(args: &PangraphExportArgs) -> Result<(), Report> { maximum_length, minimum_depth, maximum_depth, - output_directory, - prefix, - no_export_gfa, - export_panx, no_duplications, - seed, - } = &args; + output_gfa, + output_panx, + } = args; - let rng = get_random_number_generator(seed); + if [&output_gfa, &output_panx].iter().all(|o| o.is_none()) { + return make_error!("No output formats specified. Specify at least one output path."); + } - let pangraph_json = Pangraph::from_path(input_json)?; + let graph = Pangraph::from_path(&input_json)?; + + if let Some(output_gfa) = output_gfa { + gfa_write_file(output_gfa, &graph)?; + } + + if let Some(output_panx) = output_panx { + // TODO + // panx_write(output_panx, &graph)?; + } Ok(()) } diff --git a/packages/pangraph/src/commands/main.rs b/packages/pangraph/src/commands/main.rs index ff2890b1..21a0ae8e 100644 --- a/packages/pangraph/src/commands/main.rs +++ b/packages/pangraph/src/commands/main.rs @@ -17,7 +17,7 @@ pub fn pangraph_main() -> Result<(), Report> { match args.command { PangraphCommands::Build(args) => build_run(&args), - PangraphCommands::Export(args) => export_run(&args), + PangraphCommands::Export(args) => export_run(args), PangraphCommands::Simplify(args) => simplify_run(args), PangraphCommands::Reconstruct(args) => reconstruct_run(&args), PangraphCommands::Schema(args) => generate_schema(&args), diff --git a/packages/pangraph/src/io/gfa.rs b/packages/pangraph/src/io/gfa.rs new file mode 100644 index 00000000..36d0f8b7 --- /dev/null +++ b/packages/pangraph/src/io/gfa.rs @@ -0,0 +1,212 @@ +use crate::io::file::create_file_or_stdout; +use crate::io::write::WriteAdapterIoToFmt; +use crate::o; +use crate::pangraph::pangraph::Pangraph; +use eyre::{Context, Report}; +use gfa::gfa::Orientation::{Backward, Forward}; +use gfa::gfa::{Link, Path as GFAPath, Segment, GFA}; +use gfa::optfields::OptionalFields; +use rayon::prelude::*; +use std::io::Write; +use std::path::Path; + +pub fn gfa_write_file(filepath: impl AsRef, g: &Pangraph) -> Result<(), Report> { + let filepath = filepath.as_ref(); + gfa_write(create_file_or_stdout(filepath)?, g).wrap_err_with(|| format!("When writing gfa file: {filepath:#?}")) +} + +pub fn gfa_write_str(g: &Pangraph) -> Result { + let mut buf = vec![]; + gfa_write(&mut buf, g)?; + Ok(String::from_utf8(buf)?) +} + +pub fn gfa_write(writer: W, g: &Pangraph) -> Result<(), Report> { + let gfa = convert_pangraph_to_gfa(g)?; + gfa::writer::write_gfa(&gfa, &mut WriteAdapterIoToFmt(writer)); + Ok(()) +} + +fn convert_pangraph_to_gfa(pangraph: &Pangraph) -> Result, OptionalFields>, Report> { + let mut gfa = GFA::, OptionalFields>::new(); + + gfa.segments = pangraph + .blocks + .par_iter() + .map(|(_, block)| Segment { + name: block.id().to_string().as_bytes().to_vec(), + sequence: block.consensus().as_bytes().to_vec(), + optional: OptionalFields::new(), + }) + .collect(); + + gfa.links = pangraph + .nodes + .par_iter() + .map(|(_, node)| { + // FIXME: bogus data + Link::, OptionalFields> { + from_segment: o!("A").as_bytes().to_vec(), + from_orient: Backward, + to_segment: o!("B").as_bytes().to_vec(), + to_orient: Forward, + overlap: b"1M".to_vec(), + optional: OptionalFields::new(), + } + }) + .collect(); + + gfa.paths = pangraph + .paths + .par_iter() + .map(|(_, path)| { + let segment_names = path + .nodes + .iter() + .map(|node_id| pangraph.nodes[node_id].block_id().to_string()) + .collect::>(); + + let path_name = path + .name + .clone() + .unwrap_or_else(|| path.id().to_string()) + .as_bytes() + .to_vec(); + + let segment_names = segment_names.join(",").as_bytes().to_vec(); + let overlaps = vec![None; segment_names.len() - 1]; + let optional = OptionalFields::new(); + + GFAPath::new(path_name, segment_names, overlaps, optional) + }) + .collect(); + + Ok(gfa) +} + +#[cfg(test)] +mod tests { + use super::*; + use indoc::indoc; + use pretty_assertions::assert_eq; + use std::str::FromStr; + + #[test] + fn test_gfa_empty() { + let actual = gfa_write_str(&Pangraph::default()).unwrap(); + let expected = indoc! {r#" + H VN:Z:1.0 + "#}; + assert_eq!(actual, expected); + } + + #[test] + fn test_gfa_general_case() { + let g = Pangraph::from_str(indoc! { + // language=json + r#" + { + "paths": { + "0": { + "id": 0, + "nodes": [ + 14515840915932838377 + ], + "tot_len": 1737, + "circular": false, + "name": "Path A" + }, + "1": { + "id": 1, + "nodes": [ + 15291847754458130853 + ], + "tot_len": 1737, + "circular": false, + "name": "Path B" + }, + "2": { + "id": 2, + "nodes": [ + 15109482180931348145 + ], + "tot_len": 1737, + "circular": false, + "name": "Path C" + } + }, + "blocks": { + "12778560093473594666": { + "id": 12778560093473594666, + "consensus": "ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT", + "alignments": { + "14515840915932838377": { + "subs": [], + "dels": [], + "inss": [] + }, + "15291847754458130853": { + "subs": [], + "dels": [], + "inss": [] + }, + "15109482180931348145": { + "subs": [], + "dels": [], + "inss": [] + } + } + } + }, + "nodes": { + "14515840915932838377": { + "id": 14515840915932838377, + "block_id": 12778560093473594666, + "path_id": 0, + "strand": "+", + "position": [ + 0, + 0 + ] + }, + "15291847754458130853": { + "id": 15291847754458130853, + "block_id": 12778560093473594666, + "path_id": 1, + "strand": "+", + "position": [ + 0, + 0 + ] + }, + "15109482180931348145": { + "id": 15109482180931348145, + "block_id": 12778560093473594666, + "path_id": 2, + "strand": "+", + "position": [ + 0, + 0 + ] + } + } + } + "#}) + .unwrap(); + + let actual = gfa_write_str(&g).unwrap(); + + let expected = indoc! {r#" + H VN:Z:1.0 + S 12778560093473594666 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT + L A - B + 1M + L A - B + 1M + L A - B + 1M + P Path A 12778560093473594666 *,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,* + P Path B 12778560093473594666 *,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,* + P Path C 12778560093473594666 *,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,* + "#}; + + assert_eq!(actual, expected); + } +} diff --git a/packages/pangraph/src/io/mod.rs b/packages/pangraph/src/io/mod.rs index 27723987..d1767208 100644 --- a/packages/pangraph/src/io/mod.rs +++ b/packages/pangraph/src/io/mod.rs @@ -4,7 +4,9 @@ pub mod csv; pub mod fasta; pub mod file; pub mod fs; +pub mod gfa; pub mod json; pub mod json_schema; pub mod seq; +pub mod write; pub mod yaml; diff --git a/packages/pangraph/src/io/write.rs b/packages/pangraph/src/io/write.rs new file mode 100644 index 00000000..6b8edecf --- /dev/null +++ b/packages/pangraph/src/io/write.rs @@ -0,0 +1,19 @@ +/// Adapt `std::fmt::Write` to `std::io::Write` +pub struct WriteAdapterFmtToIo(pub W); + +impl std::fmt::Write for WriteAdapterFmtToIo { + fn write_str(&mut self, s: &str) -> std::fmt::Result { + #[allow(clippy::map_err_ignore)] + self.0.write_all(s.as_bytes()).map_err(|_| std::fmt::Error)?; + Ok(()) + } +} + +/// Adapt `std::io::Write` to `std::fmt::Write` +pub struct WriteAdapterIoToFmt(pub W); + +impl std::fmt::Write for WriteAdapterIoToFmt { + fn write_str(&mut self, s: &str) -> std::fmt::Result { + self.0.write_all(s.as_bytes()).map_err(|_| std::fmt::Error) + } +}