diff --git a/Cargo.lock b/Cargo.lock index 8f328b5..4389ff8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -777,8 +777,10 @@ dependencies = [ name = "pykmertools" version = "0.1.2" dependencies = [ + "clap", "composition", "kmer", + "kmertools", "pyo3", "rayon", ] diff --git a/README.md b/README.md index 787fa01..9b2bdf3 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ [![Conda](https://img.shields.io/conda/v/bioconda/kmertools)](https://anaconda.org/bioconda/kmertools) [![Conda](https://img.shields.io/conda/dn/bioconda/kmertools)](https://anaconda.org/bioconda/kmertools) [![codecov](https://codecov.io/gh/anuradhawick/kmertools/graph/badge.svg?token=IDGRE54SSQ)](https://codecov.io/gh/anuradhawick/kmertools) +![PyPI - Version](https://img.shields.io/pypi/v/pykmertools)
@@ -25,7 +26,11 @@ $$ | \$$\ $$ | $$ | $$ |\$$$$$$$\ $$ | $$ | \$$$$$$ |\$$$$$$ |$$ |$$ `kmertools` is a k-mer based feature extraction tool designed to support metagenomics and other bioinformatics analytics. This tool leverages k-mer analysis to vectorize DNA sequences, facilitating the use of these vectors in various AI/ML applications. -**NEW:** `kmertools` is now available on bioconda at [https://anaconda.org/bioconda/kmertools](https://anaconda.org/bioconda/kmertools). +**NEW** + +* `kmertools` is now available on BioConda at [https://anaconda.org/bioconda/kmertools](https://anaconda.org/bioconda/kmertools). +* `kmertools` is now available on PyPI at [https://pypi.org/project/pykmertools/](https://pypi.org/project/pykmertools/). +* `kmertools` now provides Python bindings. Read more in our [Wiki](https://github.com/anuradhawick/kmertools/wiki). ## Features @@ -33,6 +38,7 @@ $$ | \$$\ $$ | $$ | $$ |\$$$$$$$\ $$ | $$ | \$$$$$$ |\$$$$$$ |$$ |$$ - **Minimiser Binning:** Efficiently bin sequences using minimisers to reduce data complexity. - **Chaos Game Representation (CGR):** Compute CGR vectors for DNA sequences based on k-mers or whole sequence transformation. - **Coverage Histograms:** Create coverage histograms to analyze the depth of sequencing reads. +- **Python Bindings:** You can import kmertools functionality using `import pykmertools as kt`. ## Installation @@ -47,8 +53,15 @@ conda create -n kmertools -c bioconda kmertools # activate environment conda activate kmertools ``` +### Option 2: from PyPI + +You can install `kmertools` from PyPI at https://pypi.org/project/pykmertools/. + +```bash +pip install pykmertools +``` -### Option 2: from sources +### Option 3: from sources You can install `kmertools` directly from the source by cloning the repository and using Rust's package manager `cargo`. 
diff --git a/kmer/src/kmer_minimisers.rs b/kmer/src/kmer_minimisers.rs index 3808cb8..d9f82f0 100644 --- a/kmer/src/kmer_minimisers.rs +++ b/kmer/src/kmer_minimisers.rs @@ -3,6 +3,7 @@ use std::cmp::min; use std::collections::VecDeque; use std::iter::Iterator; +// TODO I cannot remember why I made this, probably a test script, but well. // https://github.com/lh3/minimap2/blob/0cc3cdca27f050fb80a19c90d25ecc6ab0b0907b/sketch.c#L9C1-L26C3 const SEQ_NT4_TABLE: [u8; 256] = [ 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, diff --git a/kmer/src/lib.rs b/kmer/src/lib.rs index 60ca5da..2342f1a 100644 --- a/kmer/src/lib.rs +++ b/kmer/src/lib.rs @@ -1,5 +1,4 @@ pub mod kmer; -pub mod kmer_minimisers; pub mod minimiser; pub type Kmer = u64; diff --git a/kmertools/src/args.rs b/kmertools/src/args.rs index 879df59..1eba8bb 100644 --- a/kmertools/src/args.rs +++ b/kmertools/src/args.rs @@ -1,4 +1,8 @@ use clap::{Args, Parser, Subcommand, ValueEnum}; +use composition::{cgr::CgrComputer, oligo::OligoComputer, oligocgr::OligoCgrComputer}; +use coverage::CovComputer; +use ktio::fops::create_directory; +use misc::minimisers; const ABOUT: &str = "kmertools: DNA vectorisation @@ -226,3 +230,131 @@ pub struct CounterCommand { #[arg(short, long, default_value_t = 0)] pub threads: usize, } + +#[cfg(not(tarpaulin_include))] +pub fn cli(cli: Cli) { + match cli.command { + Commands::Comp { command } => match command { + CompositionCommands::Oligo(command) => { + let mut com = + OligoComputer::new(command.input, command.output, command.k_size as usize); + if command.threads > 0 { + com.set_threads(command.threads); + } + com.set_norm(!command.counts); + com.set_header(command.header); + + match command.preset { + VecFmtPreset::Csv => com.set_delim(",".to_owned()), + VecFmtPreset::Spc => com.set_delim(" ".to_owned()), + VecFmtPreset::Tsv => com.set_delim("\t".to_owned()), + } + if let Err(e) = com.vectorise() { + eprintln!("Error: {}", e); + } + 
} + CompositionCommands::Cgr(command) => { + if let Some(ksize) = command.k_size { + let vecsize = command + .vec_size + .unwrap_or((ksize as f64).powf(4.0).powf(0.5) as u64) + as usize; + let mut cgr = OligoCgrComputer::new( + command.input, + command.output, + ksize as usize, + vecsize, + ); + if command.threads > 0 { + cgr.set_threads(command.threads); + } + cgr.set_norm(!command.counts); + if let Err(e) = cgr.vectorise() { + eprintln!("Error: {}", e); + } + } else { + if command.counts { + eprintln!("Error: cannot use counts in whole sequence CGR!"); + return; + } + let vecsize = command.vec_size.unwrap_or(1) as usize; + let mut cgr = CgrComputer::new(command.input, command.output, vecsize); + if command.threads > 0 { + cgr.set_threads(command.threads); + } + if let Err(e) = cgr.vectorise() { + eprintln!("Error: {}", e); + } + } + } + }, + Commands::Cov(command) => { + create_directory(&command.output).unwrap(); + let mut cov = CovComputer::new( + command.input, + command.output, + command.k_size as usize, + command.bin_size as usize, + command.bin_count as usize, + ); + if command.threads > 0 { + cov.set_threads(command.threads); + } + if let Some(path) = command.alt_input { + cov.set_kmer_path(path); + } + if command.counts { + cov.set_norm(false); + } + cov.set_max_memory(command.memory as f64); + match command.preset { + VecFmtPreset::Csv => cov.set_delim(",".to_owned()), + VecFmtPreset::Spc => cov.set_delim(" ".to_owned()), + VecFmtPreset::Tsv => cov.set_delim("\t".to_owned()), + } + cov.build_table().unwrap(); + cov.compute_coverages(); + } + Commands::Min(command) => { + if command.w_size <= command.m_size && command.w_size > 0 { + eprintln!("Window size must be longer than minimiser size!"); + return; + } + if command.m_size >= 31 { + eprintln!("Minimisers longer than 30 bases not allowed!"); + return; + } + + match command.preset { + MinFmtPreset::M2s => minimisers::bin_sequences( + command.w_size as usize, + command.m_size as usize, + &command.input, 
+ &command.output, + command.threads, + ), + MinFmtPreset::S2m => minimisers::seq_to_min( + command.w_size as usize, + command.m_size as usize, + &command.input, + &command.output, + command.threads, + ), + } + } + Commands::Ctr(command) => { + create_directory(&command.output).unwrap(); + let mut ctr = + counter::CountComputer::new(command.input, command.output, command.k_size as usize); + if command.threads > 0 { + ctr.set_threads(command.threads); + } + if command.acgt { + ctr.set_acgt_output(true); + } + ctr.set_max_memory(command.memory as f64); + ctr.count(); + ctr.merge(true); + } + } +} diff --git a/kmertools/src/lib.rs b/kmertools/src/lib.rs new file mode 100644 index 0000000..6e10f4a --- /dev/null +++ b/kmertools/src/lib.rs @@ -0,0 +1 @@ +pub mod args; diff --git a/kmertools/src/main.rs b/kmertools/src/main.rs index aefdc18..017f36a 100644 --- a/kmertools/src/main.rs +++ b/kmertools/src/main.rs @@ -1,139 +1,9 @@ -use args::CompositionCommands; +use args::Cli; use clap::Parser; -use composition::oligo::OligoComputer; -use coverage::CovComputer; -use misc::minimisers; mod args; #[cfg(not(tarpaulin_include))] fn main() { - use composition::{cgr::CgrComputer, oligocgr::OligoCgrComputer}; - use ktio::fops::create_directory; - - let cli = args::Cli::parse(); - - match cli.command { - args::Commands::Comp { command } => match command { - CompositionCommands::Oligo(command) => { - let mut com = - OligoComputer::new(command.input, command.output, command.k_size as usize); - if command.threads > 0 { - com.set_threads(command.threads); - } - com.set_norm(!command.counts); - com.set_header(command.header); - - match command.preset { - args::VecFmtPreset::Csv => com.set_delim(",".to_owned()), - args::VecFmtPreset::Spc => com.set_delim(" ".to_owned()), - args::VecFmtPreset::Tsv => com.set_delim("\t".to_owned()), - } - if let Err(e) = com.vectorise() { - eprintln!("Error: {}", e); - } - } - CompositionCommands::Cgr(command) => { - if let Some(ksize) = command.k_size { - 
let vecsize = command - .vec_size - .unwrap_or((ksize as f64).powf(4.0).powf(0.5) as u64) - as usize; - let mut cgr = OligoCgrComputer::new( - command.input, - command.output, - ksize as usize, - vecsize, - ); - if command.threads > 0 { - cgr.set_threads(command.threads); - } - cgr.set_norm(!command.counts); - if let Err(e) = cgr.vectorise() { - eprintln!("Error: {}", e); - } - } else { - if command.counts { - eprintln!("Error: cannot use counts in whole sequence CGR!"); - return; - } - let vecsize = command.vec_size.unwrap_or(1) as usize; - let mut cgr = CgrComputer::new(command.input, command.output, vecsize); - if command.threads > 0 { - cgr.set_threads(command.threads); - } - if let Err(e) = cgr.vectorise() { - eprintln!("Error: {}", e); - } - } - } - }, - args::Commands::Cov(command) => { - create_directory(&command.output).unwrap(); - let mut cov = CovComputer::new( - command.input, - command.output, - command.k_size as usize, - command.bin_size as usize, - command.bin_count as usize, - ); - if command.threads > 0 { - cov.set_threads(command.threads); - } - if let Some(path) = command.alt_input { - cov.set_kmer_path(path); - } - if command.counts { - cov.set_norm(false); - } - cov.set_max_memory(command.memory as f64); - match command.preset { - args::VecFmtPreset::Csv => cov.set_delim(",".to_owned()), - args::VecFmtPreset::Spc => cov.set_delim(" ".to_owned()), - args::VecFmtPreset::Tsv => cov.set_delim("\t".to_owned()), - } - cov.build_table().unwrap(); - cov.compute_coverages(); - } - args::Commands::Min(command) => { - if command.w_size <= command.m_size && command.w_size > 0 { - eprintln!("Window size must be longer than minimiser size!"); - return; - } - if command.m_size >= 31 { - eprintln!("Minimisers longer than 30 bases not allowed!"); - return; - } - - match command.preset { - args::MinFmtPreset::M2s => minimisers::bin_sequences( - command.w_size as usize, - command.m_size as usize, - &command.input, - &command.output, - command.threads, - ), - 
args::MinFmtPreset::S2m => minimisers::seq_to_min( - command.w_size as usize, - command.m_size as usize, - &command.input, - &command.output, - command.threads, - ), - } - } - args::Commands::Ctr(command) => { - create_directory(&command.output).unwrap(); - let mut ctr = - counter::CountComputer::new(command.input, command.output, command.k_size as usize); - if command.threads > 0 { - ctr.set_threads(command.threads); - } - if command.acgt { - ctr.set_acgt_output(true); - } - ctr.set_max_memory(command.memory as f64); - ctr.count(); - ctr.merge(true); - } - } + let cli = Cli::parse(); + args::cli(cli); } diff --git a/pykmertools/Cargo.toml b/pykmertools/Cargo.toml index dd6d1eb..33999a9 100644 --- a/pykmertools/Cargo.toml +++ b/pykmertools/Cargo.toml @@ -16,5 +16,7 @@ test = false [dependencies] pyo3 = { version = "0.22.0", "features" = ["abi3-py39"] } rayon = "1.10.0" +clap = { version = "4.5.4" } composition = { path = "../composition" } kmer = { path = "../kmer" } +kmertools = { path = "../kmertools" } diff --git a/pykmertools/pyproject.toml b/pykmertools/pyproject.toml index 7e98503..e45b77e 100644 --- a/pykmertools/pyproject.toml +++ b/pykmertools/pyproject.toml @@ -21,3 +21,6 @@ Documentation = "https://github.com/anuradhawick/kmertools/wiki" [tool.maturin] features = ["pyo3/extension-module"] + +[project.scripts] +kmertools = "pykmertools:run_cli" diff --git a/pykmertools/src/cov.rs b/pykmertools/src/cov.rs deleted file mode 100644 index 666bbf0..0000000 --- a/pykmertools/src/cov.rs +++ /dev/null @@ -1,13 +0,0 @@ -use pyo3::prelude::*; - -#[pyclass] -pub struct CovComputer {} - -#[pymethods] -impl CovComputer { - #[new] - #[pyo3(signature = ())] - fn new() -> Self { - Self {} - } -} diff --git a/pykmertools/src/kmer.rs b/pykmertools/src/kmer.rs new file mode 100644 index 0000000..3b599cb --- /dev/null +++ b/pykmertools/src/kmer.rs @@ -0,0 +1,44 @@ +use std::{mem::transmute, sync::Arc}; + +use kmer::{kmer::KmerGenerator as RsKmerGenerator, numeric_to_kmer, 
Kmer}; +use pyo3::prelude::*; + +/// Computer for generating k-mers +#[pyclass] +pub struct KmerGenerator { + _data: Arc<[u8]>, + _kg: RsKmerGenerator<'static>, + ksize: usize, +} + +#[pymethods] +impl KmerGenerator { + /// Initialise the kmer iterator + /// Attributes: + /// seq (str): string from which to extract k-mers + /// ksize (int): size of the k-mers to count + #[new] + #[pyo3(signature = (seq, ksize))] + pub fn new(seq: String, ksize: usize) -> Self { + let _data: Arc<[u8]> = Arc::from(seq.into_boxed_str().into_boxed_bytes()); + let static_str: &'static [u8] = unsafe { transmute(Arc::as_ref(&_data)) }; + let _kg = RsKmerGenerator::new(static_str, ksize); + Self { _kg, _data, ksize } + } + + /// Translate numeric k-mer to ACGT + /// Attributes: + /// kmer (int): value of the k-mer + #[pyo3(signature = (kmer))] + pub fn to_acgt(&self, kmer: u64) -> String { + numeric_to_kmer(kmer, self.ksize) + } + + pub fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { + slf + } + + pub fn __next__(mut slf: PyRefMut<'_, Self>) -> Option<(Kmer, Kmer)> { + slf._kg.next() + } +} diff --git a/pykmertools/src/lib.rs b/pykmertools/src/lib.rs index 5e718e9..f4eb3f2 100644 --- a/pykmertools/src/lib.rs +++ b/pykmertools/src/lib.rs @@ -1,19 +1,40 @@ mod cgr; -mod cov; +mod kmer; +mod min; mod oligo; use cgr::CgrComputer; +use clap::Parser; +use kmer::KmerGenerator; +use kmertools::args::{cli, Cli}; +use min::MinimiserGenerator; use oligo::OligoComputer; use pyo3::prelude::*; +#[pyfunction] +// TODO: remove after https://github.com/PyO3/maturin/issues/368 is resolved +fn run_cli(_py: Python) -> PyResult<()> { + let args: Vec<_> = std::env::args_os().skip(1).collect(); + let parsed_args = Cli::parse_from(&args); + cli(parsed_args); + Ok(()) +} + /// Pykmertools: kmertools python wrapper /// Modules: -/// OligoComputer - computing oligonucleotide frequency vectors -/// from DNA sequences -/// CgrComputer - computing chaos game representations -/// for DNA sequences +/// 
OligoComputer - computing oligonucleotide frequency vectors +/// from DNA sequences +/// CgrComputer - computing chaos game representations +/// for DNA sequences +/// KmerGenerator - an iterator object to generate k-mers +/// as (forward, reverse) numeric kmer tuples +/// MinimiserGenerator - an iterator object to iterate minimisers +/// as (kmer, start, end) numeric minimiser tuples #[pymodule] fn pykmertools(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::<OligoComputer>()?; m.add_class::<CgrComputer>()?; + m.add_class::<KmerGenerator>()?; + m.add_class::<MinimiserGenerator>()?; + m.add_function(wrap_pyfunction!(run_cli, m)?)?; Ok(()) } diff --git a/pykmertools/src/min.rs b/pykmertools/src/min.rs new file mode 100644 index 0000000..3448743 --- /dev/null +++ b/pykmertools/src/min.rs @@ -0,0 +1,48 @@ +use std::{mem::transmute, sync::Arc}; + +use kmer::{minimiser::MinimiserGenerator as RsMinimiserGenerator, numeric_to_kmer, Kmer}; +use pyo3::prelude::*; + +/// Generator for iterating over minimisers +#[pyclass] +pub struct MinimiserGenerator { + _data: Arc<[u8]>, + _mg: RsMinimiserGenerator<'static>, + msize: usize, +} + +#[pymethods] +impl MinimiserGenerator { + /// Initialise the minimiser iterator + /// Attributes: + /// seq (str): string from which to extract minimisers + /// wsize (int): size of the window + /// msize (int): size of the minimiser + #[new] + #[pyo3(signature = (seq, wsize, msize))] + pub fn new(seq: String, wsize: usize, msize: usize) -> Self { + let _data: Arc<[u8]> = Arc::from(seq.into_boxed_str().into_boxed_bytes()); + let static_str: &'static [u8] = unsafe { transmute(Arc::as_ref(&_data)) }; + let _mg = RsMinimiserGenerator::new(static_str, wsize, msize); + Self { _mg, _data, msize } + } + + /// Translate numeric minimiser to ACGT + /// Attributes: + /// kmer (int): numeric value of the minimiser + #[pyo3(signature = (kmer))] + pub fn to_acgt(&self, kmer: u64) -> String { + numeric_to_kmer(kmer, self.msize) + } + + pub fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { + slf + } + + /// Advance the iterator to the next minimiser + 
/// Returns: + /// Tuple[int, int, int]: minimiser, start pos, end pos + pub fn __next__(mut slf: PyRefMut<'_, Self>) -> Option<(Kmer, usize, usize)> { + slf._mg.next() + } +}