diff --git a/Cargo.lock b/Cargo.lock index 8f328b5..4389ff8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -777,8 +777,10 @@ dependencies = [ name = "pykmertools" version = "0.1.2" dependencies = [ + "clap", "composition", "kmer", + "kmertools", "pyo3", "rayon", ] diff --git a/README.md b/README.md index 787fa01..9b2bdf3 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ [![Conda](https://img.shields.io/conda/v/bioconda/kmertools)](https://anaconda.org/bioconda/kmertools) [![Conda](https://img.shields.io/conda/dn/bioconda/kmertools)](https://anaconda.org/bioconda/kmertools) [![codecov](https://codecov.io/gh/anuradhawick/kmertools/graph/badge.svg?token=IDGRE54SSQ)](https://codecov.io/gh/anuradhawick/kmertools) +![PyPI - Version](https://img.shields.io/pypi/v/pykmertools)
@@ -25,7 +26,11 @@ $$ | \$$\ $$ | $$ | $$ |\$$$$$$$\ $$ |         $$ |   \$$$$$$  |\$$$$$$  |$$ |$$
 
 `kmertools` is a k-mer based feature extraction tool designed to support metagenomics and other bioinformatics analytics. This tool leverages k-mer analysis to vectorize DNA sequences, facilitating the use of these vectors in various AI/ML applications.
 
-**NEW:** `kmertools` is now available on bioconda at [https://anaconda.org/bioconda/kmertools](https://anaconda.org/bioconda/kmertools).
+**NEW** 
+
+* `kmertools` is now available on BioConda at [https://anaconda.org/bioconda/kmertools](https://anaconda.org/bioconda/kmertools).
+* `kmertools` is now available on PyPI at [https://pypi.org/project/pykmertools/](https://pypi.org/project/pykmertools/).
+* `kmertools` now provides Python bindings. Read more in our [Wiki](https://github.com/anuradhawick/kmertools/wiki).
 
 ## Features
 
@@ -33,6 +38,7 @@ $$ | \$$\ $$ | $$ | $$ |\$$$$$$$\ $$ |         $$ |   \$$$$$$  |\$$$$$$  |$$ |$$
 - **Minimiser Binning:** Efficiently bin sequences using minimisers to reduce data complexity.
 - **Chaos Game Representation (CGR):** Compute CGR vectors for DNA sequences based on k-mers or whole sequence transformation.
 - **Coverage Histograms:** Create coverage histograms to analyze the depth of sequencing reads.
+- **Python Bindings:** Import kmertools functionality in Python using `import pykmertools as kt`.
 
 ## Installation
 
@@ -47,8 +53,15 @@ conda create -n kmertools -c bioconda kmertools
 # activate environment
 conda activate kmertools
 ```
+### Option 2: from PyPI
+
+You can install `kmertools` from PyPI at https://pypi.org/project/pykmertools/.
+
+```bash
+pip install pykmertools
+```
 
-### Option 2: from sources
+### Option 3: from sources
 
 You can install `kmertools` directly from the source by cloning the repository and using Rust's package manager `cargo`.
 
diff --git a/kmer/src/kmer_minimisers.rs b/kmer/src/kmer_minimisers.rs
index 3808cb8..d9f82f0 100644
--- a/kmer/src/kmer_minimisers.rs
+++ b/kmer/src/kmer_minimisers.rs
@@ -3,6 +3,7 @@ use std::cmp::min;
 use std::collections::VecDeque;
 use std::iter::Iterator;
 
+// TODO: purpose unclear - probably left over from a test script; confirm and remove if unused.
 // https://github.com/lh3/minimap2/blob/0cc3cdca27f050fb80a19c90d25ecc6ab0b0907b/sketch.c#L9C1-L26C3
 const SEQ_NT4_TABLE: [u8; 256] = [
     0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
diff --git a/kmer/src/lib.rs b/kmer/src/lib.rs
index 60ca5da..2342f1a 100644
--- a/kmer/src/lib.rs
+++ b/kmer/src/lib.rs
@@ -1,5 +1,4 @@
 pub mod kmer;
-pub mod kmer_minimisers;
 pub mod minimiser;
 pub type Kmer = u64;
 
diff --git a/kmertools/src/args.rs b/kmertools/src/args.rs
index 879df59..1eba8bb 100644
--- a/kmertools/src/args.rs
+++ b/kmertools/src/args.rs
@@ -1,4 +1,8 @@
 use clap::{Args, Parser, Subcommand, ValueEnum};
+use composition::{cgr::CgrComputer, oligo::OligoComputer, oligocgr::OligoCgrComputer};
+use coverage::CovComputer;
+use ktio::fops::create_directory;
+use misc::minimisers;
 
 const ABOUT: &str = "kmertools: DNA vectorisation
 
@@ -226,3 +230,131 @@ pub struct CounterCommand {
     #[arg(short, long, default_value_t = 0)]
     pub threads: usize,
 }
+
+#[cfg(not(tarpaulin_include))] // left out of tarpaulin coverage measurement
+pub fn cli(cli: Cli) { // runs the subcommand selected by the parsed command line
+    match cli.command {
+        Commands::Comp { command } => match command {
+            CompositionCommands::Oligo(command) => {
+                let mut com =
+                    OligoComputer::new(command.input, command.output, command.k_size as usize);
+                if command.threads > 0 { // only override the thread count when one was given
+                    com.set_threads(command.threads);
+                }
+                com.set_norm(!command.counts); // asking for counts turns normalisation off
+                com.set_header(command.header);
+
+                match command.preset { // the preset only selects the output delimiter
+                    VecFmtPreset::Csv => com.set_delim(",".to_owned()),
+                    VecFmtPreset::Spc => com.set_delim(" ".to_owned()),
+                    VecFmtPreset::Tsv => com.set_delim("\t".to_owned()),
+                }
+                if let Err(e) = com.vectorise() { // failure is only printed; this function sets no exit status
+                    eprintln!("Error: {}", e);
+                }
+            }
+            CompositionCommands::Cgr(command) => {
+                if let Some(ksize) = command.k_size { // a k-mer size selects the oligo-based CGR
+                    let vecsize = command
+                        .vec_size
+                        .unwrap_or((ksize as f64).powf(4.0).powf(0.5) as u64) // NOTE(review): default evaluates to ksize^2 - confirm this is the intended size
+                        as usize;
+                    let mut cgr = OligoCgrComputer::new(
+                        command.input,
+                        command.output,
+                        ksize as usize,
+                        vecsize,
+                    );
+                    if command.threads > 0 {
+                        cgr.set_threads(command.threads);
+                    }
+                    cgr.set_norm(!command.counts);
+                    if let Err(e) = cgr.vectorise() {
+                        eprintln!("Error: {}", e);
+                    }
+                } else { // no k-mer size: whole-sequence CGR
+                    if command.counts {
+                        eprintln!("Error: cannot use counts in whole sequence CGR!");
+                        return;
+                    }
+                    let vecsize = command.vec_size.unwrap_or(1) as usize;
+                    let mut cgr = CgrComputer::new(command.input, command.output, vecsize);
+                    if command.threads > 0 {
+                        cgr.set_threads(command.threads);
+                    }
+                    if let Err(e) = cgr.vectorise() {
+                        eprintln!("Error: {}", e);
+                    }
+                }
+            }
+        },
+        Commands::Cov(command) => {
+            create_directory(&command.output).unwrap(); // panics if create_directory returns Err
+            let mut cov = CovComputer::new(
+                command.input,
+                command.output,
+                command.k_size as usize,
+                command.bin_size as usize,
+                command.bin_count as usize,
+            );
+            if command.threads > 0 {
+                cov.set_threads(command.threads);
+            }
+            if let Some(path) = command.alt_input {
+                cov.set_kmer_path(path);
+            }
+            if command.counts {
+                cov.set_norm(false);
+            }
+            cov.set_max_memory(command.memory as f64);
+            match command.preset {
+                VecFmtPreset::Csv => cov.set_delim(",".to_owned()),
+                VecFmtPreset::Spc => cov.set_delim(" ".to_owned()),
+                VecFmtPreset::Tsv => cov.set_delim("\t".to_owned()),
+            }
+            cov.build_table().unwrap(); // panics if build_table returns Err
+            cov.compute_coverages();
+        }
+        Commands::Min(command) => {
+            if command.w_size <= command.m_size && command.w_size > 0 { // NOTE(review): w_size == 0 bypasses this check - presumably a special value; confirm
+                eprintln!("Window size must be longer than minimiser size!");
+                return;
+            }
+            if command.m_size >= 31 {
+                eprintln!("Minimisers longer than 30 bases not allowed!");
+                return;
+            }
+
+            match command.preset { // both presets receive the same arguments; only the routine differs
+                MinFmtPreset::M2s => minimisers::bin_sequences(
+                    command.w_size as usize,
+                    command.m_size as usize,
+                    &command.input,
+                    &command.output,
+                    command.threads,
+                ),
+                MinFmtPreset::S2m => minimisers::seq_to_min(
+                    command.w_size as usize,
+                    command.m_size as usize,
+                    &command.input,
+                    &command.output,
+                    command.threads,
+                ),
+            }
+        }
+        Commands::Ctr(command) => {
+            create_directory(&command.output).unwrap(); // panics if create_directory returns Err
+            let mut ctr =
+                counter::CountComputer::new(command.input, command.output, command.k_size as usize);
+            if command.threads > 0 {
+                ctr.set_threads(command.threads);
+            }
+            if command.acgt {
+                ctr.set_acgt_output(true);
+            }
+            ctr.set_max_memory(command.memory as f64);
+            ctr.count();
+            ctr.merge(true);
+        }
+    }
+}
diff --git a/kmertools/src/lib.rs b/kmertools/src/lib.rs
new file mode 100644
index 0000000..6e10f4a
--- /dev/null
+++ b/kmertools/src/lib.rs
@@ -0,0 +1 @@
+pub mod args;
diff --git a/kmertools/src/main.rs b/kmertools/src/main.rs
index aefdc18..017f36a 100644
--- a/kmertools/src/main.rs
+++ b/kmertools/src/main.rs
@@ -1,139 +1,9 @@
-use args::CompositionCommands;
+use args::Cli;
 use clap::Parser;
-use composition::oligo::OligoComputer;
-use coverage::CovComputer;
-use misc::minimisers;
 mod args;
 
 #[cfg(not(tarpaulin_include))]
 fn main() {
-    use composition::{cgr::CgrComputer, oligocgr::OligoCgrComputer};
-    use ktio::fops::create_directory;
-
-    let cli = args::Cli::parse();
-
-    match cli.command {
-        args::Commands::Comp { command } => match command {
-            CompositionCommands::Oligo(command) => {
-                let mut com =
-                    OligoComputer::new(command.input, command.output, command.k_size as usize);
-                if command.threads > 0 {
-                    com.set_threads(command.threads);
-                }
-                com.set_norm(!command.counts);
-                com.set_header(command.header);
-
-                match command.preset {
-                    args::VecFmtPreset::Csv => com.set_delim(",".to_owned()),
-                    args::VecFmtPreset::Spc => com.set_delim(" ".to_owned()),
-                    args::VecFmtPreset::Tsv => com.set_delim("\t".to_owned()),
-                }
-                if let Err(e) = com.vectorise() {
-                    eprintln!("Error: {}", e);
-                }
-            }
-            CompositionCommands::Cgr(command) => {
-                if let Some(ksize) = command.k_size {
-                    let vecsize = command
-                        .vec_size
-                        .unwrap_or((ksize as f64).powf(4.0).powf(0.5) as u64)
-                        as usize;
-                    let mut cgr = OligoCgrComputer::new(
-                        command.input,
-                        command.output,
-                        ksize as usize,
-                        vecsize,
-                    );
-                    if command.threads > 0 {
-                        cgr.set_threads(command.threads);
-                    }
-                    cgr.set_norm(!command.counts);
-                    if let Err(e) = cgr.vectorise() {
-                        eprintln!("Error: {}", e);
-                    }
-                } else {
-                    if command.counts {
-                        eprintln!("Error: cannot use counts in whole sequence CGR!");
-                        return;
-                    }
-                    let vecsize = command.vec_size.unwrap_or(1) as usize;
-                    let mut cgr = CgrComputer::new(command.input, command.output, vecsize);
-                    if command.threads > 0 {
-                        cgr.set_threads(command.threads);
-                    }
-                    if let Err(e) = cgr.vectorise() {
-                        eprintln!("Error: {}", e);
-                    }
-                }
-            }
-        },
-        args::Commands::Cov(command) => {
-            create_directory(&command.output).unwrap();
-            let mut cov = CovComputer::new(
-                command.input,
-                command.output,
-                command.k_size as usize,
-                command.bin_size as usize,
-                command.bin_count as usize,
-            );
-            if command.threads > 0 {
-                cov.set_threads(command.threads);
-            }
-            if let Some(path) = command.alt_input {
-                cov.set_kmer_path(path);
-            }
-            if command.counts {
-                cov.set_norm(false);
-            }
-            cov.set_max_memory(command.memory as f64);
-            match command.preset {
-                args::VecFmtPreset::Csv => cov.set_delim(",".to_owned()),
-                args::VecFmtPreset::Spc => cov.set_delim(" ".to_owned()),
-                args::VecFmtPreset::Tsv => cov.set_delim("\t".to_owned()),
-            }
-            cov.build_table().unwrap();
-            cov.compute_coverages();
-        }
-        args::Commands::Min(command) => {
-            if command.w_size <= command.m_size && command.w_size > 0 {
-                eprintln!("Window size must be longer than minimiser size!");
-                return;
-            }
-            if command.m_size >= 31 {
-                eprintln!("Minimisers longer than 30 bases not allowed!");
-                return;
-            }
-
-            match command.preset {
-                args::MinFmtPreset::M2s => minimisers::bin_sequences(
-                    command.w_size as usize,
-                    command.m_size as usize,
-                    &command.input,
-                    &command.output,
-                    command.threads,
-                ),
-                args::MinFmtPreset::S2m => minimisers::seq_to_min(
-                    command.w_size as usize,
-                    command.m_size as usize,
-                    &command.input,
-                    &command.output,
-                    command.threads,
-                ),
-            }
-        }
-        args::Commands::Ctr(command) => {
-            create_directory(&command.output).unwrap();
-            let mut ctr =
-                counter::CountComputer::new(command.input, command.output, command.k_size as usize);
-            if command.threads > 0 {
-                ctr.set_threads(command.threads);
-            }
-            if command.acgt {
-                ctr.set_acgt_output(true);
-            }
-            ctr.set_max_memory(command.memory as f64);
-            ctr.count();
-            ctr.merge(true);
-        }
-    }
+    let cli = Cli::parse();
+    args::cli(cli);
 }
diff --git a/pykmertools/Cargo.toml b/pykmertools/Cargo.toml
index dd6d1eb..33999a9 100644
--- a/pykmertools/Cargo.toml
+++ b/pykmertools/Cargo.toml
@@ -16,5 +16,7 @@ test = false
 [dependencies]
 pyo3 = { version = "0.22.0", "features" = ["abi3-py39"] }
 rayon = "1.10.0"
+clap = { version = "4.5.4" }
 composition = { path = "../composition" }
 kmer = { path = "../kmer" }
+kmertools = { path = "../kmertools" }
diff --git a/pykmertools/pyproject.toml b/pykmertools/pyproject.toml
index 7e98503..e45b77e 100644
--- a/pykmertools/pyproject.toml
+++ b/pykmertools/pyproject.toml
@@ -21,3 +21,6 @@ Documentation = "https://github.com/anuradhawick/kmertools/wiki"
 
 [tool.maturin]
 features = ["pyo3/extension-module"]
+
+[project.scripts]
+kmertools = "pykmertools:run_cli"
diff --git a/pykmertools/src/cov.rs b/pykmertools/src/cov.rs
deleted file mode 100644
index 666bbf0..0000000
--- a/pykmertools/src/cov.rs
+++ /dev/null
@@ -1,13 +0,0 @@
-use pyo3::prelude::*;
-
-#[pyclass]
-pub struct CovComputer {}
-
-#[pymethods]
-impl CovComputer {
-    #[new]
-    #[pyo3(signature = ())]
-    fn new() -> Self {
-        Self {}
-    }
-}
diff --git a/pykmertools/src/kmer.rs b/pykmertools/src/kmer.rs
new file mode 100644
index 0000000..3b599cb
--- /dev/null
+++ b/pykmertools/src/kmer.rs
@@ -0,0 +1,44 @@
+use std::{mem::transmute, sync::Arc};
+
+use kmer::{kmer::KmerGenerator as RsKmerGenerator, numeric_to_kmer, Kmer};
+use pyo3::prelude::*;
+
+/// Iterator that generates k-mers from a sequence
+#[pyclass]
+pub struct KmerGenerator {
+    _data: Arc<[u8]>, // owns the sequence bytes that `_kg` reads through a lifetime-extended reference
+    _kg: RsKmerGenerator<'static>, // underlying generator, built over the bytes held in `_data`
+    ksize: usize, // k-mer length, used by `to_acgt`
+}
+
+#[pymethods]
+impl KmerGenerator {
+    /// Initialise the kmer iterator
+    /// Attributes:
+    ///     seq (str): string from which to extract k-mers
+    ///     ksize (int): size of the k-mers to generate
+    #[new]
+    #[pyo3(signature = (seq, ksize))]
+    pub fn new(seq: String, ksize: usize) -> Self {
+        let _data: Arc<[u8]> = Arc::from(seq.into_boxed_str().into_boxed_bytes());
+        let static_str: &'static [u8] = unsafe { transmute(Arc::as_ref(&_data)) }; // NOTE(review): lifetime is extended to 'static; relies on `_data` being stored in Self next to `_kg` - confirm drop order is harmless
+        let _kg = RsKmerGenerator::new(static_str, ksize);
+        Self { _kg, _data, ksize }
+    }
+
+    /// Translate numeric k-mer to ACGT
+    /// Attributes:
+    ///     kmer (int): value of the k-mer
+    #[pyo3(signature = (kmer))]
+    pub fn to_acgt(&self, kmer: u64) -> String {
+        numeric_to_kmer(kmer, self.ksize)
+    }
+
+    pub fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { // the object is its own iterator
+        slf
+    }
+
+    pub fn __next__(mut slf: PyRefMut<'_, Self>) -> Option<(Kmer, Kmer)> { // forwards to the wrapped generator; None means it is exhausted
+        slf._kg.next()
+    }
+}
diff --git a/pykmertools/src/lib.rs b/pykmertools/src/lib.rs
index 5e718e9..f4eb3f2 100644
--- a/pykmertools/src/lib.rs
+++ b/pykmertools/src/lib.rs
@@ -1,19 +1,40 @@
 mod cgr;
-mod cov;
+mod kmer;
+mod min;
 mod oligo;
 use cgr::CgrComputer;
+use clap::Parser;
+use kmer::KmerGenerator;
+use kmertools::args::{cli, Cli};
+use min::MinimiserGenerator;
 use oligo::OligoComputer;
 use pyo3::prelude::*;
 
+#[pyfunction]
+// TODO: remove after https://github.com/PyO3/maturin/issues/368 is resolved
+fn run_cli(_py: Python) -> PyResult<()> { // target of the `kmertools` console script; `_py` is unused
+    let args: Vec<_> = std::env::args_os().skip(1).collect(); // drops the first process argument - NOTE(review): presumably the Python interpreter, leaving the script path as the program name; confirm
+    let parsed_args = Cli::parse_from(&args);
+    cli(parsed_args);
+    Ok(())
+}
+
 /// Pykmertools: kmertools python wrapper
 /// Modules:
-///     OligoComputer - computing oligonucleotide frequency vectors
-///                     from DNA sequences
-///     CgrComputer   - computing chaos game representations
-///                     for DNA sequences
+///     OligoComputer      - computing oligonucleotide frequency vectors
+///                          from DNA sequences
+///     CgrComputer        - computing chaos game representations
+///                          for DNA sequences
+///     KmerGenerator      - an iterator object to generate k-mers
+///                          as (forward, reverse) numeric kmer tuples
+///     MinimiserGenerator - an iterator object to iterate minimisers
+///                          as (kmer, start, end) numeric minimiser tuples
 #[pymodule]
 fn pykmertools(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::()?;
     m.add_class::()?;
+    m.add_class::()?;
+    m.add_class::()?;
+    m.add_function(wrap_pyfunction!(run_cli, m)?)?;
     Ok(())
 }
diff --git a/pykmertools/src/min.rs b/pykmertools/src/min.rs
new file mode 100644
index 0000000..3448743
--- /dev/null
+++ b/pykmertools/src/min.rs
@@ -0,0 +1,48 @@
+use std::{mem::transmute, sync::Arc};
+
+use kmer::{minimiser::MinimiserGenerator as RsMinimiserGenerator, numeric_to_kmer, Kmer};
+use pyo3::prelude::*;
+
+/// Iterator that generates minimisers from a sequence
+#[pyclass]
+pub struct MinimiserGenerator {
+    _data: Arc<[u8]>, // owns the sequence bytes that `_mg` reads through a lifetime-extended reference
+    _mg: RsMinimiserGenerator<'static>, // underlying generator, built over the bytes held in `_data`
+    msize: usize, // minimiser length, used by `to_acgt`
+}
+
+#[pymethods]
+impl MinimiserGenerator {
+    /// Initialise the minimiser iterator
+    /// Attributes:
+    ///     seq (str): string from which to extract minimisers
+    ///     wsize (int): size of the window
+    ///     msize (int): size of the minimiser
+    #[new]
+    #[pyo3(signature = (seq, wsize, msize))]
+    pub fn new(seq: String, wsize: usize, msize: usize) -> Self {
+        let _data: Arc<[u8]> = Arc::from(seq.into_boxed_str().into_boxed_bytes());
+        let static_str: &'static [u8] = unsafe { transmute(Arc::as_ref(&_data)) }; // NOTE(review): lifetime is extended to 'static; relies on `_data` being stored in Self next to `_mg` - confirm drop order is harmless
+        let _mg = RsMinimiserGenerator::new(static_str, wsize, msize);
+        Self { _mg, _data, msize }
+    }
+
+    /// Translate a numeric minimiser to ACGT
+    /// Attributes:
+    ///     kmer (int): numeric value of the minimiser
+    #[pyo3(signature = (kmer))]
+    pub fn to_acgt(&self, kmer: u64) -> String {
+        numeric_to_kmer(kmer, self.msize)
+    }
+
+    pub fn __iter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { // the object is its own iterator
+        slf
+    }
+
+    /// Get the next minimiser from the wrapped generator
+    /// Returns:
+    ///     Tuple[int, int, int]: minimiser, start pos, end pos
+    pub fn __next__(mut slf: PyRefMut<'_, Self>) -> Option<(Kmer, usize, usize)> {
+        slf._mg.next()
+    }
+}